diff --git a/.github/workflows/archiver.yml b/.github/workflows/archiver.yml index 114a2ec0ca..5ac17d45a2 100644 --- a/.github/workflows/archiver.yml +++ b/.github/workflows/archiver.yml @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2024) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. #---------------------------------------------------------------------------------------------------------------------------------- diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 7dd6a2f963..72ffe64b17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, S. Roiser, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. + name: C/C++ CI on: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py index b21e98934e..b8ac77c3b8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, O. Mattelaer, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. # AV - Rename the plugin as CUDACPP_OUTPUT (even if the madgraph4gpu directory is still called CUDACPP_SA_OUTPUT) # This can be used in mg5amcnlo in one of two ways: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu index 3679e681e1..405faee649 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
!========================================================================== //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h index fcfc4b3153..67b3ba40c4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. !========================================================================== //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index b1739da73d..262d39a736 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. # Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3f2f65688f..1e2905dad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 77538b7e1c..a7b1c6f9fd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 49b928db67..a45024704a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. // Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 6a4b946e74..086aa6a616 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
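
Aside, not part of the patch: the gpu*/gpuBlas* macro layer above lets one source file compile against either CUDA (nvcc) or HIP (hipcc), and the gpuBlasT* aliases pick the single or double precision BLAS flavour from MGONGPU_FPTYPE2_FLOAT. The sketch below shows how a caller might drive a GEMM through this layer; all function and buffer names here are hypothetical, and checkGpuBlas is the error-checking helper added to GpuRuntime.h in the next hunk.

// Hypothetical sketch: devTmp = devColorMat * devJampsRe through the macro layer above
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
void sketchBlasGemm( const fptype2* devColorMat, const fptype2* devJampsRe, fptype2* devTmp, int ncolor, int nevt )
{
#ifndef MGONGPU_HAS_NO_BLAS
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // checkGpu is built into the gpuStream* macros
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS calls on this handle run on 'stream'
  const fptype2 alpha = 1, beta = 0;
  // gpuBlasTgemm resolves to cublasSgemm/cublasDgemm under nvcc and hipblasSgemm/hipblasDgemm under hipcc
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, ncolor, nevt, ncolor, &alpha, devColorMat, ncolor, devJampsRe, ncolor, &beta, devTmp, ncolor ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
#endif
}
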
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 703ea3781c..a68ae314eb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -166,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a host array" ); @@ -193,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -208,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -220,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -314,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -341,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelDevice: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices?
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -363,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -385,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -403,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 8da04d7945..c901874333 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -8,9 +8,12 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only
nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) - - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) 
===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 73ce5b3325..007485ea58 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
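
Aside, not part of the patch: with the non-trivial AOSOA branch deleted above, "trivial access" for amplitudes means the fptype buffer is simply reinterpreted in place as complex values, with no ipagA/ieppA page arithmetic. A minimal sketch, assuming a no-SIMD C++ build where cxtype_sv is plain cxtype and assuming the stripped template parameter of KernelAccessAmplitudes is the usual onDevice bool; the buffer and values are hypothetical.

// Hypothetical sketch of trivial amplitude access on the host
#include "MemoryAccessAmplitudes.h"
void sketchAmpAccess()
{
  using namespace mg5amcCpu;
  fptype buffer[2 * mgOnGpu::nx2] = { 0 }; // two complex amplitudes as interleaved re/im pairs
  cxtype_sv* amps = KernelAccessAmplitudes<false>::kernelAccess( buffer ); // just a reinterpret_cast
  amps[0] = cxmake( 1., 0. ); // first amplitude, no AOSOA decoding needed
}
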
#ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) - - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index deddc425f5..47e8acbcfa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
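
Aside, not part of the patch: the new DeviceAccessWavefunctions above replaces the generic kernel-access templates with a per-event offset, so each GPU thread addresses its own slice of CPPProcess::nw6 * mgOnGpu::nx2 floats. A minimal sketch of a kernel using it; the kernel name and buffer are hypothetical.

// Hypothetical CUDA/HIP sketch: one thread per event, each owning nw6 complex wavefunction components
#include "MemoryAccessWavefunctions.h"
__global__ void sketchZeroFirstWfComponent( fptype* allWfs ) // size nevt * CPPProcess::nw6 * mgOnGpu::nx2
{
  using namespace mg5amcGpu;
  cxtype_sv* wf = DeviceAccessWavefunctions::kernelAccess( allWfs ); // offsets by ievt * nw6 * nx2 internally
  wf[0] = cxmake( 0., 0. ); // zero this event's first component
}
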
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_%(model_name)s_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc new file mode 100644 index 0000000000..9e8360023b --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -0,0 +1,374 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
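The jamp buffers added in the MemoryBuffers.h hunk above hold sizePerEventJamps = ncolor * nx2 fptypes per event, i.e. jamp[ncolor*2*nevt] fptypes for one helicity, which is exactly the allJamps layout expected by the color_sum functions in the new files below; in mixed-precision mode the scratch buffer grows to blasTmp[(2*ncolor*2+1)*nevt] fptype2s (two jamp-sized planes plus one MEs-sized plane). A standalone bookkeeping sketch (ncolor and nevt are toy values here; in generated code ncolor is CPPProcess::ncolor and nx2 is mgOnGpu::nx2):

#include <cstdio>
int main()
{
  const int ncolor = 24, nx2 = 2, nevt = 16384;       // toy values
  const int sizePerEventJamps = ncolor * nx2;         // fptypes per event
  const int nJamps = sizePerEventJamps * nevt;        // jamp[ncolor*2*nevt] for one helicity
  const int nBlasTmp = ( 2 * ncolor * 2 + 1 ) * nevt; // mixed mode: two jamp-sized fptype2 buffers plus one nevt-sized fptype2 buffer
  printf( "jamps: %d fptypes, mixed-mode blasTmp: %d fptype2s\n", nJamps, nBlasTmp );
  return 0;
}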
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** +%(color_matrix_lines)s + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
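The rewrite described in this comment can be checked with a few lines of standalone C++: for a real symmetric M, the quadratic form conj(J).M.J with J = A + iB reduces to A.M.A + B.M.B, and each of these can be folded onto the upper triangle with doubled off-diagonal terms (toy matrix and toy jamps below, not a generated color matrix):

#include <cassert>
#include <cmath>
int main()
{
  const double M[2][2] = { { 3, 1 }, { 1, 2 } };           // toy real symmetric "color matrix"
  const double A[2] = { 0.5, -1.0 }, B[2] = { 2.0, 0.25 }; // toy Re(jamp), Im(jamp)
  double full = 0; // full quadratic form A.M.A + B.M.B over all (i,j)
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += A[i] * M[i][j] * A[j] + B[i] * M[i][j] * B[j];
  double tri = 0; // triangular form: diagonal once, off-diagonal terms doubled
  for( int i = 0; i < 2; i++ )
  {
    double ztR = M[i][i] * A[i], ztI = M[i][i] * B[i];
    for( int j = i + 1; j < 2; j++ )
    {
      ztR += 2 * M[i][j] * A[j];
      ztI += 2 * M[i][j] * B[j];
    }
    tri += A[i] * ztR + B[i] * ztI;
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}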
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
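Steps 1 and 2 in color_sum_blas above amount, per event, to ME[ievt] += Jamps(ievt,:) . ( NormColMat * Jamps(ievt,:)^T ), evaluated once for the real plane and once for the imaginary plane. A minimal standalone sketch of the same two cuBLAS calls (raw single-precision cublasSgemm/cublasSgemmStridedBatched instead of the gpuBlasTgemm* wrappers, a toy 2-color matrix, real plane only; all names and sizes are illustrative):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
  const int ncolor = 2, nevt = 4;
  float hColMat[ncolor * ncolor] = { 3, 1, 1, 2 }; // toy normalized color matrix (column-major ncolor x ncolor)
  float hJampsR[ncolor * nevt];                    // real jamp plane (column-major nevt x ncolor, ievt fastest as in "new1")
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      hJampsR[icol * nevt + ievt] = 0.1f * ( icol + 1 ) * ( ievt + 1 );
  float *dColMat, *dJampsR, *dZtempR, *dMEs;
  cudaMalloc( (void**)&dColMat, sizeof( hColMat ) );
  cudaMalloc( (void**)&dJampsR, sizeof( hJampsR ) );
  cudaMalloc( (void**)&dZtempR, ncolor * nevt * sizeof( float ) );
  cudaMalloc( (void**)&dMEs, nevt * sizeof( float ) );
  cudaMemcpy( dColMat, hColMat, sizeof( hColMat ), cudaMemcpyHostToDevice );
  cudaMemcpy( dJampsR, hJampsR, sizeof( hJampsR ), cudaMemcpyHostToDevice );
  cudaMemset( dMEs, 0, nevt * sizeof( float ) ); // beta=1 in step 2 accumulates into MEs
  cublasHandle_t handle;
  cublasCreate( &handle );
  float alpha = 1, beta0 = 0, beta1 = 1;
  // Step 1: Ztemp(ncolor x nevt) = ColMat(ncolor x ncolor) * JampsR^T (JampsR is nevt x ncolor, hence OP_T)
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T, ncolor, nevt, ncolor,
               &alpha, dColMat, ncolor, dJampsR, nevt, &beta0, dZtempR, ncolor );
  // Step 2: nevt batched 1x1 "gemms", i.e. per-event dot products MEs[ievt] += JampsR(ievt,:) . Ztemp(:,ievt)
  cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, ncolor,
                             &alpha, dJampsR, nevt, 1, dZtempR, ncolor, ncolor,
                             &beta1, dMEs, 1, 1, nevt );
  float hMEs[nevt];
  cudaMemcpy( hMEs, dMEs, sizeof( hMEs ), cudaMemcpyDeviceToHost );
  for( int ievt = 0; ievt < nevt; ievt++ ) printf( "ME[%d] = %f\n", ievt, hMEs[ievt] );
  cublasDestroy( handle );
  cudaFree( dColMat ); cudaFree( dJampsR ); cudaFree( dZtempR ); cudaFree( dMEs );
  return 0;
}

In the generated code the same pattern runs twice (real and imaginary planes) with beta=1 so both accumulate into the same MEs, and in mixed mode the jamps are first converted to fptype2 by convertD2F_Jamps.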
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc index cace65e4b8..aa32fdc2a6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== %(function_definitions)s } // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 7175e85bb2..006405432d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -2,10 +2,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
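The "new1" striding hard-coded in DeviceAccessJamp above stores, for one helicity, the real parts of all colors for all events first and then all the imaginary parts, with the event index running fastest; this is what lets cuBLAS treat each plane directly as a column-major nevt x ncolor matrix. A worked index calculation (toy sizes; in the kernels nevt and ievt come from the CUDA grid):

#include <cstdio>
int main()
{
  const int ncolor = 3, nevt = 8; // toy sizes
  const int icol = 1, ievt = 2;
  const int iReal = 0 * ncolor * nevt + icol * nevt + ievt; // real plane, ievt fastest: 10
  const int iImag = 1 * ncolor * nevt + icol * nevt + ievt; // imag plane follows all reals: 34
  printf( "jamp(icol=%d,ievt=%d): real at [%d], imag at [%d]\n", icol, ievt, iReal, iImag );
  return 0;
}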
//========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 32d12a5bba..22acd3abe9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -479,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -599,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -782,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -801,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -834,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -878,6 +931,7 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 @@ -979,6 +1033,7 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk index 4934c9a53f..1f9f8bbc46 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc new file mode 100644 index 0000000000..7aee564187 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc @@ -0,0 +1,11 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
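The wavefunction layout described in the diagram_boilerplate.h comments above (per-wavefunction blocks of nevt event records, each record a contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 fptypes) can be made concrete with a standalone index calculation (toy sizes; in generated code nwf is process-specific and nevt comes from the CUDA grid):

#include <cstdio>
int main()
{
  const int nevt = 4, nw6 = 6, nx2 = 2, nwf = 5; // toy sizes
  const int iwf = 3, ievt = 1, iw6 = 2;
  const int wfBase = iwf * nevt * nw6 * nx2;     // start of wavefunction iwf (stride nevt*nw6*nx2 between wavefunctions)
  const int evBase = wfBase + ievt * nw6 * nx2;  // start of this event's 12-fptype [RIRI...] record
  printf( "wf %d of %d, component iw6=%d: real at [%d], imag at [%d]\n", iwf, nwf, iw6, evBase + iw6 * nx2, evBase + iw6 * nx2 + 1 );
  return 0;
}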
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- +%(code)s + +/* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 68bbf1b934..dd695e591a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, BLAS is enabled, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
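The MGONGPU_HAS_NO_BLAS defaults above mean that nvcc and hipcc builds compile the BLAS color-sum path in unless it is explicitly switched off, while C++-only builds never do. A standalone sketch mirroring that #ifdef chain (compiled as plain C++ it prints the no-BLAS branch; __CUDACC__ and __HIPCC__ are defined by nvcc and hipcc respectively):

#include <cstdio>
#ifdef __CUDACC__
// nvcc default: MGONGPU_HAS_NO_BLAS left undefined (cuBLAS available) unless -DMGONGPU_HAS_NO_BLAS is passed
#elif defined __HIPCC__
// hipcc default: MGONGPU_HAS_NO_BLAS left undefined (hipBLAS available) unless -DMGONGPU_HAS_NO_BLAS is passed
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++-only builds: no BLAS color sums
#endif
int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  printf( "color sums: plain kernels only\n" );
#else
  printf( "color sums: BLAS path compiled in (kernel fallback still selectable at run time)\n" );
#endif
  return 0;
}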
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 444c848e10..43fa9db7d8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -14,6 +14,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -23,6 +24,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc index 4e5e942a41..6d9568490d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -1,10 +1,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: A. 
Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // A class for calculating the matrix elements for @@ -50,17 +50,17 @@ static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = %(nbhel)d; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = %(ndiagrams)d; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = %(ncolor)s; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = __NWF__; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = %(nexternal)d; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = %(nwavefuncs)d; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = %(namp)d; //static const int ncomb = %(ncomb)d; // CPPProcess::ncomb diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 76b6e773bd..043f311587 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -1,7 +1,7 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. @@ -16,20 +16,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_%(model_name)s_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_%(model_name)s_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = %(ncolor)s; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -88,12 +84,58 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // 
SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- @@ -117,8 +159,10 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif + + // Enable SIGFPE traps for Floating Point Exceptions #ifdef MGONGPUCPP_DEBUG - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + fpeEnable(); #endif } @@ -148,6 +192,10 @@ namespace mg5amcCpu //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s @@ -183,6 +231,10 @@ namespace mg5amcCpu //Parameters_%(model_name)s::printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(hardcoded_initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -285,26 +337,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -312,25 +364,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at 
+#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // input/tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // Sum the MEs of all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d > nconfig=%%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
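Both select_col above and the helicity choice in add_and_select_hel rely on the same inverse-CDF sampling idea: accumulate non-negative weights into a running sum (targetamp above), then return the first bin whose normalised cumulative value exceeds a uniform random number in [0,1). A self-contained host-side illustration with hypothetical names, not plugin code:

  // Inverse-CDF selection over n weights (illustrative helper only).
  #include <cassert>
  int selectFromWeights( const double* weights, int n, double u ) // u uniform in [0,1)
  {
    assert( n > 0 && n <= 64 );
    double cumul[64]; // fixed illustrative bound; select_col uses ncolor instead
    double running = 0;
    for( int i = 0; i < n; i++ ) { running += weights[i]; cumul[i] = running; }
    for( int i = 0; i < n; i++ )
      if( u < cumul[i] / cumul[n - 1] ) return i; // first bin whose normalised CDF exceeds u
    return n - 1; // guard against rounding when u is very close to 1
  }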
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -466,22 +722,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = %(nproc)i; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = %(proc_id)i; // code generation source: %(proc_id_source)s + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 895b7ec1d6..a9bd9c7728 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -17,6 +17,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_%(model_name)s.h" #include @@ -46,23 +47,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -76,34 +80,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // 
input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // 
tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 2700d7e7da..e0603558aa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -1,152 +1,15 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) -%(color_matrix_lines)s - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; -#else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + 
ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === -#endif - } - - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); return; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index d49047a623..32b552d101 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -1,22 +1,25 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
!========================================================================== - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -42,93 +45,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - }
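The scheduling idea behind this rewritten sigmaKin is one GPU stream per good helicity, so that kernels for different helicities may overlap on the device, with a single synchronisation point before the combined helicity/color choice. The bare pattern is sketched below, stripped of all plugin specifics: the work kernel and buffer layout are invented for the example, and only standard CUDA API calls are used.

  #include <cuda_runtime.h>
  __global__ void work( float* buf ) { buf[blockDim.x * blockIdx.x + threadIdx.x] += 1.f; }
  void launchPerHelicity( float** bufs, cudaStream_t* streams, int nGoodHel, int blocks, int threads )
  {
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      work<<<blocks, threads, 0, streams[ighel]>>>( bufs[ighel] ); // kernels in distinct streams may overlap
    cudaDeviceSynchronize(); // wait for all helicity streams before the helicity/color selection
  }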
// Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -170,7 +143,7 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -193,7 +166,7 @@ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -202,25 +175,31 @@ } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -230,8 +209,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -247,11 +228,12 @@ //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt2, ihelF ); break; } } @@ -353,14 +335,15 @@ #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 
@@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 554c97974b..ec4c6fab01 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. import os import sys @@ -211,7 +211,7 @@ def get_header_txt(self, name=None, couplings=None,mode=''): output = '%(doublec)s allvertexes[]' % { 'doublec': self.type2def['double']} comment_output = 'amplitude \'vertex\'' - template = 'template' + template = 'template' else: output = '%(doublec)s all%(spin)s%(id)d[]' % { 'doublec': self.type2def['double'], @@ -219,7 +219,7 @@ def get_header_txt(self, name=None, couplings=None,mode=''): 'id': self.outgoing} ###self.declaration.add(('list_complex', output)) # AV BUG FIX - THIS IS NOT NEEDED AND IS WRONG (adds name 'cxtype_sv V3[]') comment_output = 'wavefunction \'%s%d[6]\'' % ( self.particles[self.outgoing -1], self.outgoing ) # AV (wavefuncsize=6) - template = 'template' + template = 'template' comment = '// Compute the output %s from the input wavefunctions %s' % ( comment_output, ', '.join(comment_inputs) ) # AV indent = ' ' * len( ' %s( ' % name ) out.write(' %(comment)s\n %(template)s\n %(prefix)s void\n %(name)s( const %(args)s,\n%(indent)s%(output)s )%(suffix)s' % @@ -258,7 +258,7 @@ def get_declaration_txt(self, add_i=True): if type.startswith('list'): out.write(' const %s* %s = W_ACCESS::kernelAccessConst( all%s );\n' % ( self.type2def[type[5:]+'_v'], name, name ) ) if name.startswith('COUP'): # AV from cxtype_sv to fptype array (running alphas #373) - out.write(' const cxtype_sv %s = C_ACCESS::kernelAccessConst( all%s );\n' % ( name, name ) ) + out.write(' const cxtype_sv %s = CD_ACCESS::kernelAccessConst( all%s );\n' % ( name, name ) ) if not self.offshell: vname = 'vertex' access = 'A_ACCESS' @@ -961,9 +961,9 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] 
replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = CD_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) @@ -1300,52 +1300,62 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): color_amplitudes[0], multi_channel_map = multi_channel ) + self.diagram_code = self.helas_call_writer.diagram_code # hack? get code in helascallwriter, write it to diagrams.h in oneprocessexporter ###misc.sprint( 'after get_matrix_element_calls', self.matrix_elements[0].get_number_of_wavefunctions() ) # CORRECT value of nwf, eg 5 for gg_tt assert len(self.matrix_elements) == 1 # how to handle if this is not true? self.couplings2order = self.helas_call_writer.couplings2order self.params2order = self.helas_call_writer.params2order ret_lines.append(""" - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include \"diagrams.h\" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes 
all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -1353,50 +1363,124 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; + + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- #ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\\n", ihel );""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append(""" - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = %i; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)"""%nwavefuncs ) ret_lines.append(""" // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
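The "= {}" remark in the last line above is worth spelling out: for an array whose element type has no user-provided constructor, omitting the initialiser leaves scalar members indeterminate, while "= {}" value-initialises every element to zero. A tiny standalone illustration, with a two-double struct standing in for a scalar cxtype:

  // Minimal illustration of the "= {}" note above (mycx stands in for cxtype).
  struct mycx { double r; double i; }; // no user-provided ctor: members are not zeroed by default
  int main()
  {
    mycx a[4];      // default-initialised: a[i].r and a[i].i are indeterminate (reading them is UB)
    mycx b[4] = {}; // value-initialised: every r and i is exactly 0.0
    (void)a;
    return ( b[0].r == 0. && b[3].i == 0. ) ? 0 : 1;
  }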
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; -#endif +#endif + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; +#endif + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); +#endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( jamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); +#endif
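The reinterpret_cast used for 'jamps' just above relies on the complex type being laid out as consecutive real values (real part then imaginary part), so an array of ncolor complex amplitudes can be walked as 2*ncolor reals. A standalone sketch of that aliasing, under the stated layout assumption and with a plain two-double struct standing in for cxtype_sv:

  // Sketch of the jamps pointer aliasing, with an invented stand-in type cx.
  #include <cassert>
  struct cx { double r, i; };
  static_assert( sizeof( cx ) == 2 * sizeof( double ), "layout assumption: cx is two consecutive doubles" );
  int main()
  {
    cx jamp[3] = { { 1, 2 }, { 3, 4 }, { 5, 6 } };
    double* flat = reinterpret_cast<double*>( jamp ); // flat[2*icol] = Re, flat[2*icol+1] = Im
    assert( flat[4] == 5. && flat[5] == 6. );
    return 0;
  }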
jamp_sv : &( jamp_sv[ncolor] ) ); +#endif + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; +#else + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; +#endif +#else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; +#endif + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; +#else + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif +#else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; +#endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------""") + self.nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() # this was for nwf in CPPProcess.cc: now keep it for CPPProcess.h ret_lines += helas_calls else: ret_lines.extend([self.get_sigmaKin_single_process(i, me) \ @@ -1410,6 +1494,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): file = self.get_matrix_single_process( i, me, color_amplitudes[i], class_name ) file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_matrix.inc (copyright) file_extend.append( file ) + assert i == 0, "more than one ME in get_all_sigmaKin_lines" # AV sanity check (added for color_sum.cc but valid independently) ret_lines.extend( file_extend ) return '\n'.join(ret_lines) @@ -1439,7 +1524,7 @@ def generate_process_files(self): self.edit_check_sa() self.edit_mgonGPU() self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) - + self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) @@ -1459,6 +1544,12 @@ def generate_process_files(self): PLUGIN_export_cpp.cp( ref, self.path + '/../../test/ref' ) ###else: ###misc.sprint( 'Test reference file does not exist and will not be copied: ', ref ) + # Set the value of nwf in CPPProcess.h after generating CPPProcess.cc (workaround for #644) + 
cppprocess_h = os.path.join(self.path, self.include_dir, '%s.h' % self.process_class) + with open(cppprocess_h, 'r') as file: data = file.read().replace('__NWF__', '%d'%self.nwavefuncs) + with open(cppprocess_h, 'w') as file: file.write(data) + # Generate diagram headers after generating CPPProcess.cc + self.edit_diagrams(self.diagram_code) # SR - generate CMakeLists.txt file inside the P* directory def edit_CMakeLists(self): @@ -1518,6 +1609,28 @@ def edit_processidfile(self): ff.write(template % replace_dict) ff.close() + # AV - new method + def edit_colorsum(self): + """Generate color_sum.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_colorsum') + template = open(pjoin(self.template_path,'gpu','color_sum.cc'),'r').read() + replace_dict = {} + # Extract color matrix again (this was also in get_matrix_single_process called within get_all_sigmaKin_lines) + replace_dict['color_matrix_lines'] = self.get_color_matrix_lines(self.matrix_elements[0]) + ff = open(pjoin(self.path, 'color_sum.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_diagrams(self, diagrams): + """Generate diagrams.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_diagrams') + template = open(pjoin(self.template_path,'gpu','diagram_h.inc'),'r').read() + replace_dict = {} + replace_dict['code'] = ''.join(diagrams) # all diagrams to a single file + ff = open(pjoin(self.path, 'diagrams.h'),'w') + ff.write(template % replace_dict) + ff.close() def generate_subprocess_directory_end(self, **opt): """ opt contain all local variable of the fortran original function""" @@ -1688,11 +1801,11 @@ def get_color_matrix_lines(self, matrix_element): """Return the color matrix definition lines for this matrix element. Split rows in chunks of size n.""" import madgraph.core.color_algebra as color if not matrix_element.get('color_matrix'): - return '\n'.join([' static constexpr fptype2 denom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) + return '\n'.join([' static constexpr fptype2 colorDenom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) else: color_denominators = matrix_element.get('color_matrix').\ get_line_denominators() - denom_string = ' static constexpr fptype2 denom[ncolor] = { %s }; // 1-D array[%i]' \ + denom_string = ' static constexpr fptype2 colorDenom[ncolor] = { %s }; // 1-D array[%i]' \ % ( ', '.join(['%i' % denom for denom in color_denominators]), len(color_denominators) ) matrix_strings = [] my_cs = color.ColorString() @@ -1700,12 +1813,12 @@ def get_color_matrix_lines(self, matrix_element): # Then write the numerators for the matrix elements num_list = matrix_element.get('color_matrix').get_line_numerators(index, denominator) matrix_strings.append('{ %s }' % ', '.join(['%d' % i for i in num_list])) - matrix_string = ' static constexpr fptype2 cf[ncolor][ncolor] = ' - if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' + matrix_string = ' static constexpr fptype2 colorMatrix[ncolor][ncolor] = ' + if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' else: matrix_string += '{ ' + matrix_strings[0] + ' };' matrix_string += ' // 2-D array[%i][%i]' % ( len(color_denominators), len(color_denominators) ) - denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) - matrix_comment = '\n // The color matrix (initialize all array 
elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) denom_string = denom_comment + denom_string matrix_string = matrix_comment + matrix_string return '\n'.join([denom_string, matrix_string]) @@ -1857,6 +1970,103 @@ def format_coupling(self, call): def format_call(call): return call.replace('(','( ').replace(')',' )').replace(',',', ') + # AV - new method + def get_one_diagram_code(self, diagram, id_amp, multi_channel_map, diag_to_config, color, ndiagrams): + res = [] + idiagram = diagram.get('number') + ###print('DIAGRAM %3d: #wavefunctions=%3d, #diagrams=%3d' % + ### (diagram.get('number'), len(diagram.get('wavefunctions')), len(diagram.get('amplitudes')) )) # AV - FOR DEBUGGING + # 1 - Header + if idiagram == 1: + res.append(""" + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include \"diagram_boilerplate.h\" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif""") + else: + sidiag = '%i'%idiagram + indent = ' '*(len(sidiag)-1) + res.append(""" + + __global__ void + diagram%s( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] +%s fptype* jamps, // output jamps[ncolor*2*nevtORneppV] +%s const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL +%s const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else +%s const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif +%s fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel +%s fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers are nullptr as a sanity check +#include \"diagram_boilerplate.h\""""%(sidiag,indent,indent,indent,indent,indent,indent)) + # 2 - Core code + res.append(' // *** DIAGRAM %i OF %i ***' % ( idiagram, ndiagrams ) ) # AV + res.append(' // Wavefunction(s) for diagram number %d' % idiagram) # AV + for wf in diagram.get('wavefunctions'): + wfline = ' '+self.get_wavefunction_call(wf) # AV new: add formatting + if wfline[-1] == '\n': wfline = wfline[:-1] + res.append( wfline ) + if len(diagram.get('wavefunctions')) == 0 : res.append(' // (none)') # AV + res.append(' // Amplitude(s) for diagram number %d' % idiagram) + for amplitude in diagram.get('amplitudes'): + id_amp +=1 + namp = amplitude.get('number') + amplitude.set('number', 1) + res.append(' '+self.get_amplitude_call(amplitude)) # AV new: add formatting + if multi_channel_map: # different code bases #473 (assume this is the same as self.include_multi_channel...) + if id_amp in diag_to_config: + ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472 + ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 + res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") + res.append(" if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % idiagram) + res.append(" if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );") + res.append("#endif") + else: + res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") + res.append(" // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)") + res.append("#endif") + for njamp, coeff in color[namp].items(): + scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV + if scoeff[0] == '+' : scoeff = scoeff[1:] + scoeff = scoeff.replace('(','( ') + scoeff = scoeff.replace(')',' )') + scoeff = scoeff.replace(',',', ') + scoeff = scoeff.replace('*',' * ') + scoeff = scoeff.replace('/',' / ') + if scoeff.startswith('-'): + res.append(' J_ACCESS::kernelAccessIcol( jamps, %s ) -= %samp_sv[0];' % (njamp, scoeff[1:])) + else: + res.append(' J_ACCESS::kernelAccessIcol( jamps, %s ) += %samp_sv[0];' % (njamp, scoeff)) + if len(diagram.get('amplitudes')) == 0 : res.append(' // (none)') # AV + # 3 - Footer + res.append(""" } + + //--------------------------------------------------------------------------""") + # Return + return res, id_amp + # AV - replace helas_call_writers.GPUFOHelasCallWriter method (improve formatting) def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi_channel_map=False): """Return a list of strings, corresponding to the Helas calls for the matrix element""" @@ -1880,55 +2090,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi ###misc.sprint(multi_channel_map) res = [] ###res.append('for(int i=0;i<%s;i++){jamp[i] = cxtype(0.,0.);}' % len(color_amplitudes)) - res.append("""//constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif -#else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif""") diagrams = matrix_element.get('diagrams') diag_to_config = {} if multi_channel_map: @@ -1938,45 +2099,22 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi idiag in multi_channel_map[config]], [])] diag_to_config[amp[0]] = config ###misc.sprint(diag_to_config) + res.append('\n // *** DIAGRAMS 1 TO %d ***' % (len(matrix_element.get('diagrams'))) ) # AV + res.append('#ifdef MGONGPUCPP_GPUIMPL') + for idiagram in range(1,len(matrix_element.get('diagrams'))+1): + if idiagram == 1: res.append('gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );') + else: res.append('gpuLaunchKernelStream( diagram%i, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );'%idiagram) + res.append('#else') + for idiagram in range(1,len(matrix_element.get('diagrams'))+1): + if idiagram == 1: res.append('diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );') + else: res.append('diagram%i( wfs, jamps, channelIds, COUPs, numerators, denominators );'%idiagram) + res.append('#endif') + # Generate diagram code + self.diagram_code = [] 
id_amp = 0 for diagram in matrix_element.get('diagrams'): - ###print('DIAGRAM %3d: #wavefunctions=%3d, #diagrams=%3d' % - ### (diagram.get('number'), len(diagram.get('wavefunctions')), len(diagram.get('amplitudes')) )) # AV - FOR DEBUGGING - res.append('\n // *** DIAGRAM %d OF %d ***' % (diagram.get('number'), len(matrix_element.get('diagrams'))) ) # AV - res.append('\n // Wavefunction(s) for diagram number %d' % diagram.get('number')) # AV - res.extend([ self.get_wavefunction_call(wf) for wf in diagram.get('wavefunctions') ]) # AV new: avoid format_call - if len(diagram.get('wavefunctions')) == 0 : res.append('// (none)') # AV - if res[-1][-1] == '\n' : res[-1] = res[-1][:-1] - res.append('\n // Amplitude(s) for diagram number %d' % diagram.get('number')) - for amplitude in diagram.get('amplitudes'): - id_amp +=1 - namp = amplitude.get('number') - amplitude.set('number', 1) - res.append(self.get_amplitude_call(amplitude)) # AV new: avoid format_call - if multi_channel_map: # different code bases #473 (assume this is the same as self.include_multi_channel...) - if id_amp in diag_to_config: - ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472 - ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") - res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diagram.get('number')) - res.append("if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );") - res.append("#endif") - else: - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") - res.append("// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)") - res.append("#endif") - for njamp, coeff in color[namp].items(): - scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV - if scoeff[0] == '+' : scoeff = scoeff[1:] - scoeff = scoeff.replace('(','( ') - scoeff = scoeff.replace(')',' )') - scoeff = scoeff.replace(',',', ') - scoeff = scoeff.replace('*',' * ') - scoeff = scoeff.replace('/',' / ') - if scoeff.startswith('-'): res.append('jamp_sv[%s] -= %samp_sv[0];' % (njamp, scoeff[1:])) # AV - else: res.append('jamp_sv[%s] += %samp_sv[0];' % (njamp, scoeff)) # AV - if len(diagram.get('amplitudes')) == 0 : res.append('// (none)') # AV - ###res.append('\n // *** END OF DIAGRAMS ***' ) # AV - no longer needed ('COLOR MATRIX BELOW') + res_diagram, id_amp = self.get_one_diagram_code(diagram, id_amp, multi_channel_map, diag_to_config, color, len(matrix_element.get('diagrams'))) + self.diagram_code.append( '\n'.join(res_diagram) ) return res # AV - overload helas_call_writers.GPUFOHelasCallWriter method (improve formatting) @@ -2148,8 +2286,8 @@ def generate_helas_call(self, argument): if usesdepcoupl is None: raise Exception('PANIC! 
could not determine if this call uses aS-dependent or aS-independent couplings?') elif usesdepcoupl: caccess = 'CD_ACCESS' else: caccess = 'CI_ACCESS' - ###if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += '' - ###else : arg['routine_name'] += '' + ###if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += '' + ###else : arg['routine_name'] += '' if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += ''%caccess else : arg['routine_name'] += ''%caccess if isinstance(argument, helas_objects.HelasWavefunction): diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 209f088314..ff0d1f10d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. import os import sys @@ -103,6 +103,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', + s+'gpu/color_sum.h', + s+'gpu/diagram_boilerplate.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -126,6 +128,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', + 'color_sum.h', + 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', @@ -222,6 +226,7 @@ def generate_subprocess_directory(self, subproc_group, fortran_model, me=None): misc.sprint("need to link", self.to_link_in_P) out = super().generate_subprocess_directory(subproc_group, fortran_model, me) return out + # AV (default from OM's tutorial) - add a debug printout def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]): if hasattr(model , 'cudacpp_wanted_ordered_couplings'): diff --git a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh index 097935efc8..acb186f448 100755 --- a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh @@ -1,45 +1,76 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Oct 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set -e # fail on error cd $(dirname $0)/.. 
-./CODEGEN/generateAndCompare.sh -q ee_mumu -./CODEGEN/generateAndCompare.sh -q ee_mumu --mad +bsm= +while [ "$1" != "" ]; do + if [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then + bsm=$1 + shift + elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then + bsm=$1 + shift + else + echo "Usage: $0 [-bsmonly|-nobsm]" + exit 1 + fi +done -./CODEGEN/generateAndCompare.sh -q gg_tt -./CODEGEN/generateAndCompare.sh -q gg_tt --mad +# SM processes (both mad and sa) -./CODEGEN/generateAndCompare.sh -q gg_ttg -./CODEGEN/generateAndCompare.sh -q gg_ttg --mad +if [ "${bsm}" != "-bsmonly" ]; then -./CODEGEN/generateAndCompare.sh -q gg_ttgg -./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad + ./CODEGEN/generateAndCompare.sh -q ee_mumu + ./CODEGEN/generateAndCompare.sh -q ee_mumu --mad -./CODEGEN/generateAndCompare.sh -q gg_ttggg -./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad + ./CODEGEN/generateAndCompare.sh -q gg_tt + ./CODEGEN/generateAndCompare.sh -q gg_tt --mad -./CODEGEN/generateAndCompare.sh -q gq_ttq -./CODEGEN/generateAndCompare.sh -q gq_ttq --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttg + ./CODEGEN/generateAndCompare.sh -q gg_ttg --mad -./CODEGEN/generateAndCompare.sh -q heft_gg_bb -./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttgg + ./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_tt -./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttggg + ./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + ./CODEGEN/generateAndCompare.sh -q gq_ttq + ./CODEGEN/generateAndCompare.sh -q gq_ttq --mad -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad +fi -./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad +# BSM processes -./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad +if [ "${bsm}" != "-nobsm" ]; then -./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad + + ./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad + +fi + +# SM processes (mad only) + +if [ "${bsm}" != "-bsmonly" ]; then + + ./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad + + ./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + +fi diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 6221b1cfee..8e36c0eb3e 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
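Usage note: the option-parsing loop added to allGenerateAndCompare.sh above makes the two modes mutually exclusive (hence the cross-checks on "$bsm" in each branch). Running './CODEGEN/allGenerateAndCompare.sh -bsmonly' regenerates only the BSM processes (heft, susy, smeft, nobm), './CODEGEN/allGenerateAndCompare.sh -nobsm' regenerates only the SM processes, and running the script with no arguments regenerates everything.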
set -e # fail on error @@ -358,10 +358,12 @@ function codeGenAndDiff() fi done fi - # Remove card.jpg, diagrams.html and matrix*.jpg files (NB: these are only created if ghostscript is installed) + # Remove card.jpg/png, diagrams.html and matrix*.jpg/png files (NB: these are only created if ghostscript is installed) \rm -f ${outproc}/SubProcesses/P*/card.jpg + \rm -f ${outproc}/SubProcesses/P*/card.png \rm -f ${outproc}/SubProcesses/P*/diagrams.html \rm -f ${outproc}/SubProcesses/P*/matrix*jpg + \rm -f ${outproc}/SubProcesses/P*/matrix*png # Cleanup \rm -f ${outproc}/crossx.html \rm -f ${outproc}/index.html diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index db84a9053c..3226606748 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006200551986694336  +DEBUG: model prefixing takes 0.005434751510620117  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,16 +155,16 @@ Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -176,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.070 s +Wrote files for 8 helas calls in 0.073 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s +ALOHA: aloha creates 3 routines in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.253 s +ALOHA: aloha creates 7 routines in 0.243 s FFV1 FFV1 FFV2 @@ -200,37 +200,37 @@ ALOHA: aloha creates 7 routines in 0.253 s FFV4 FFV2_4 FFV2_4 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 236 (offset 9 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.054s -user 0m1.767s -sys 0m0.275s +real 0m2.060s +user 0m1.796s +sys 0m0.262s Code generation completed in 2 seconds ************************************************************ * * @@ -244,7 +244,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -252,9 +252,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -274,7 +274,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -282,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index bb623f867a..7f8313745d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 74f70b567b..c1037c83d7 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 68ee164d00..4ba7540657 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/ee_mumu.mad/Source/makefile b/epochX/cudacpp/ee_mumu.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/makefile +++ b/epochX/cudacpp/ee_mumu.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc index 80d5ae41aa..83061d9ae9 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
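All the Bridge.h hunks above follow one pattern: informational banners and the abnormal-ME check are now compiled in only when MGONGPUCPP_VERBOSE is defined. A minimal standalone sketch of the same gating (the function name and arguments are illustrative, not part of the patch):

  #include <iostream>
  // Banner is compiled in only for MGONGPUCPP_VERBOSE builds (as in Bridge.h above)
  inline void printGridBanner( int nevt, int gpublocks, int gputhreads )
  {
  #ifdef MGONGPUCPP_VERBOSE
    std::cout << "WARNING! Instantiate device Bridge (nevt=" << nevt
              << ", gpublocks=" << gpublocks << ", gputhreads=" << gputhreads
              << ", gpublocks*gputhreads=" << gpublocks * gputhreads << ")" << std::endl;
  #else
    (void)nevt; (void)gpublocks; (void)gputhreads; // avoid unused-parameter warnings
  #endif
  }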
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
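For reference, the new gpuLaunchKernelStream macro in GpuAbstraction.h above expands to a standard triple-chevron launch with zero bytes of dynamic shared memory on the given stream, identically under CUDA and HIP. A minimal standalone CUDA sketch of that expansion (kernel and buffer names are illustrative, not from the patch):

  #include <cuda_runtime.h>
  __global__ void dummyDiagram( float* out ) // stand-in for one generated diagramN kernel
  {
    const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
    out[ievt] += 1.f;
  }
  int main()
  {
    const int gpublocks = 2, gputhreads = 32;
    float* devOut;
    cudaMalloc( &devOut, gpublocks * gputhreads * sizeof( float ) ); // gpuMalloc in the abstraction layer
    cudaStream_t stream;
    cudaStreamCreate( &stream ); // gpuStreamCreate
    // gpuLaunchKernelStream( dummyDiagram, gpublocks, gputhreads, stream, devOut ) expands to:
    dummyDiagram<<<gpublocks, gputhreads, 0, stream>>>( devOut );
    cudaStreamSynchronize( stream );
    cudaStreamDestroy( stream ); // gpuStreamDestroy
    cudaFree( devOut );
    return 0;
  }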
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
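The checkGpuBlas/assertGpuBlas helpers added to GpuRuntime.h above mirror the existing checkGpu/assertGpu pattern for BLAS status codes. A hedged sketch of how they combine with the gpuBlas* aliases from GpuAbstraction.h, assuming a build without MGONGPU_HAS_NO_BLAS (the GEMM shapes and buffers are placeholders, not the actual color-sum call):

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h"
  // One double-precision GEMM on a given stream, status-checked at every step;
  // gpuBlasTgemm resolves to cublasDgemm/hipblasDgemm unless MGONGPU_FPTYPE2_FLOAT is defined.
  void exampleBlasGemm( gpuStream_t stream, const double* dA, const double* dB, double* dC, int n )
  {
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // e.g. run on a per-helicity stream
    const double alpha = 1., beta = 0.;
    checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                n, n, n, &alpha, dA, n, dB, n, &beta, dC, n ) );
    checkGpuBlas( gpuBlasDestroy( handle ) );
  }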
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
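A hedged illustration (not part of the patch) of what "trivial access" means after the amplitude access simplification above: a buffer is simply reinterpreted as complex values, with no AOSOA page arithmetic left. The <false> template argument (host-side access) follows the template<bool onDevice> parameter reconstructed above, and the example only makes sense in a C++ (no-SIMD, no-CUDA) build where cxtype_sv is a scalar complex type:

#include "MemoryAccessAmplitudes.h"
void sketchTrivialAmplitudeAccess()
{
  using namespace mg5amcCpu;
  fptype buf[2] = { 1., -2. }; // one scalar complex amplitude: real and imaginary parts
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( buf );
  // amp[0] now aliases buf as a single complex value, i.e. ( 1., -2. ) in scalar builds
}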
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
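The new accessors above imply an event-major AOS layout for device wavefunction buffers: within the buffer of one wavefunction, every event owns nw6 complex components, i.e. nw6 * nx2 fptypes, which is exactly the per-event offset used by DeviceAccessWavefunctions. A hedged helper (not part of the patch; the name wfOffset is hypothetical) spelling out that arithmetic:

#include <cstddef>
// AOS index within one wavefunction buffer: event-major, then component, then real/imaginary part
// (defaults nw6=6 and nx2=2 match the values quoted elsewhere in this patch for e+ e- -> mu+ mu-)
inline size_t wfOffset( size_t ievt, size_t iw6, size_t ix2, size_t nw6 = 6, size_t nx2 = 2 )
{
  return ievt * nw6 * nx2 + iw6 * nx2 + ix2; // matches "buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2" above
}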
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 7bd57a8dbb..cad6526137 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,281 +279,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif #endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - - oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) - - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -539,7 +448,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif }
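// A minimal glibc-specific sketch (an assumption: this is not the plugin's actual
// fpeEnable implementation) of what enabling SIGFPE traps involves; the change above
// restricts this to MGONGPUCPP_DEBUG builds since trapping alters production behaviour.
#include <fenv.h> // feenableexcept is a GNU extension
#include <cstdio>
int main()
{
  // Trap invalid operations, division by zero and overflow; leave underflow untrapped
  // (small jamp2/ME contributions may legitimately underflow, see #831).
  if( feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ) == -1 )
    printf( "WARNING! could not enable FPE traps\n" );
  return 0;
}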
//-------------------------------------------------------------------------- @@ -572,6 +485,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -613,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -715,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -742,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; fptype hstMEs[maxtry0]; const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // SANITY CHECK (nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + }
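// A small standalone sketch (plain C++, hypothetical buffers) of the in-place trick
// used just above and below: first overwrite the per-helicity MEs with their running
// (cumulative) sum, then pick the first helicity whose normalized cumulative value
// exceeds a uniform random number in [0,1) (inverse-CDF sampling, cf. #403).
#include <cassert>
int main()
{
  const int nGoodHel = 3;
  double mes[3] = { 1.0, 3.0, 6.0 }; // per-helicity |M|^2 contributions for one event
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    total += mes[ighel];
    mes[ighel] = total; // reuse the buffer to store the running sum
  }
  const double rndhel = 0.35; // uniform random number for this event
  int selhel = -1;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    if( rndhel < mes[ighel] / total ) { selhel = ighel; break; }
  assert( selhel == 1 ); // cumulative fractions are 0.1, 0.4, 1.0
  return 0;
}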
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -898,20 +1023,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -923,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
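// A minimal sketch (simplified types, not the plugin API) of the per-helicity
// "super-buffer" convention behind the ghelAll* pointers zeroed above: one
// contiguous allocation holds cNGoodHel slices of nevt elements each, and the
// slice for good-helicity index ighel simply starts at offset ighel*nevt.
#include <cassert>
#include <vector>
using fptype = double;
inline fptype* helicitySlice( fptype* superBuffer, int ighel, int nevt ) { return superBuffer + ighel * nevt; }
int main()
{
  const int nGoodHel = 3, nevt = 8;
  std::vector<fptype> ghelAllMEs( nGoodHel * nevt, 0 ); // zeroed like the gpuMemset above
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    fptype* hAllMEs = helicitySlice( ghelAllMEs.data(), ighel, nevt ); // one helicity's view
    for( int ievt = 0; ievt < nevt; ievt++ ) hAllMEs[ievt] = ighel;
  }
  assert( ghelAllMEs[2 * nevt] == 2 ); // slice 2 starts at offset 2*nevt
  return 0;
}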
#else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -959,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the numerators and denominators for each helicity for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1087,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1110,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1119,25 +1211,31 @@ } #endif
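// A standalone sketch (plain arrays, not the plugin's SIMD vector types) of the
// "mixed" fptypes idea referenced in the NB comments above (#537, #924): two
// double-precision event pages are merged into one single-precision page for the
// color algebra, then the per-page results are split back (fpvmerge/fpvsplit analogue).
#include <cstdio>
int main()
{
  const int neppV = 4; // double-precision SIMD page size (an assumption)
  double pageA[4] = { 1.0, 2.0, 3.0, 4.0 }, pageB[4] = { 5.0, 6.0, 7.0, 8.0 };
  float merged[8]; // one single-precision vector covering 2*neppV events
  for( int i = 0; i < neppV; i++ )
  {
    merged[i] = (float)pageA[i];         // fpvmerge: first page in the low half
    merged[neppV + i] = (float)pageB[i]; // fpvmerge: second page in the high half
  }
  // ... the float-precision color algebra would run once on 'merged' here ...
  double outA[4], outB[4];
  for( int i = 0; i < neppV; i++ )
  {
    outA[i] = merged[i];         // fpvsplit0 analogue
    outB[i] = merged[neppV + i]; // fpvsplit1 analogue
  }
  printf( "outA[0]=%f outB[0]=%f\n", outA[0], outB[0] );
  return 0;
}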
// Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1147,8 +1245,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1164,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1270,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // === PART 2 - FINALISATION (after calculate_jamps) === // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 159826a904..50da8f60b2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 2 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 70fe04e4d8..27da02d9c0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 280eff025e..a453b7c2b6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)), QSCALE) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 ) = @@ -149,7 +149,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)), QSCALE) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 ) = @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -302,6 +302,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -385,14 +389,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) EP1(IVEC)=PDG2PDF(LPP(IB(1)),-11, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) EM2(IVEC)=PDG2PDF(LPP(IB(2)),11, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF @@ -502,6 +506,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..8fbdb5c7fb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -0,0 +1,381 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
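// A minimal reference sketch (standalone, not the plugin code) of the color sum that
// this new file implements: |M|^2 += sum_{i,j} Re( conj(jamp[i]) * jamp[j] ) * cf[i][j] / denom[i],
// which is real because the color matrix cf is real and symmetric.
#include <complex>
double colorSumReference( const std::complex<double>* jamp, const double* cf, const double* denom, int ncolor )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += ( cf[i * ncolor + j] / denom[i] ) * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return me2;
}
// For ncolor=1 with cf={{1}} and denom={1}, as in this e+ e- process, it reduces to |jamp[0]|^2.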
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
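// A small standalone check (assuming a real symmetric M, as for the color matrix) that
// the folded upper-triangular evaluation used above matches the full quadratic form:
// sum_{i,j} x_i M_ij x_j == sum_i ( M_ii x_i + sum_{j>i} 2 M_ij x_j ) x_i.
#include <cassert>
#include <cmath>
int main()
{
  const int n = 3;
  const double M[3][3] = { { 2, 1, 0 }, { 1, 3, 1 }, { 0, 1, 4 } }; // symmetric
  const double x[3] = { 0.5, -1.0, 2.0 };
  double full = 0, folded = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ ) full += x[i] * M[i][j] * x[j];
  for( int i = 0; i < n; i++ )
  {
    double ztemp = M[i][i] * x[i]; // diagonal term counted once
    for( int j = i + 1; j < n; j++ ) ztemp += 2 * M[i][j] * x[j]; // off-diagonal terms doubled
    folded += ztemp * x[i];
  }
  assert( std::abs( full - folded ) < 1e-12 );
  return 0;
}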
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h new file mode 100644 index 0000000000..2fca11bf87 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h @@ -0,0 +1,79 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 2 ***
+    // Wavefunction(s) for diagram number 1
+    oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 );
+    oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 );
+    FFV1P0_3<W_ACCESS, CI_ACCESS>( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0<W_ACCESS, A_ACCESS, CI_ACCESS>( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 2 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_4_3<W_ACCESS, CI_ACCESS>( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
+    // Amplitude(s) for diagram number 2
+    FFV2_4_0<W_ACCESS, A_ACCESS, CI_ACCESS>( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] );
+#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 1a2e5df4e6..ad813c359c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -325,7 +325,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -368,7 +368,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -411,17 +412,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 1) /1.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 1) /1/ C 1 ColorOne() C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WZ.NE.0D0) FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ - $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + FK_ZERO = 0D0 + IF(MDL_WZ.NE.0D0) THEN + FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ + $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + ELSE + FK_MDL_WZ = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
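The FK_MDL_WZ rewrite in the hunk above guards the small-width regulator against a vanishing width instead of evaluating SIGN with a zero argument. A minimal C++ sketch of the same guard, with hypothetical names (illustration only, not code from the patch):

  #include <algorithm>
  #include <cmath>
  // Fortran: FK = SIGN( MAX( ABS(W), ABS(M*SMALL_WIDTH_TREATMENT) ), W ), or 0 when W == 0
  double regulatedWidth( double mass, double width, double smallWidthTreatment )
  {
    if( width == 0. ) return 0.; // the new ELSE branch: FK_MDL_WZ = 0D0
    return std::copysign( std::max( std::abs( width ), std::abs( mass * smallWidthTreatment ) ), width );
  }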
@@ -455,10 +461,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +475,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
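In the matrix1.f hunks above, the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix becomes an INTEGER array holding only the upper triangle, scanned row by row (J >= I), with a common DENOM divided out once after the loop. A short C++ sketch of the equivalent packing (hypothetical names; the off-diagonal entries are assumed to absorb the symmetry factor 2, consistent with TriangularNormalizedColorMatrix in color_sum.cc above):

  #include <vector>
  std::vector<int> packUpperTriangle( const std::vector<std::vector<int>>& dense )
  {
    const int n = (int)dense.size();
    std::vector<int> packed;
    packed.reserve( n * ( n + 1 ) / 2 );
    for( int i = 0; i < n; i++ )
      for( int j = i; j < n; j++ ) // same traversal as the Fortran 'DO J = I, NCOLOR' loop with CF_INDEX
        packed.push_back( i == j ? dense[i][j] : 2 * dense[i][j] ); // off-diagonals carry the symmetry factor
    return packed;
  }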
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 );           // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
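+  // Reference for the "new1" jamp layout used above (restating the index formulas for clarity):
+  // the buffer holds two contiguous ncolor*nevt planes, all real parts first and all imaginary
+  // parts second, with the event index running fastest within each color row:
+  //   real( icol, ievt ) -> buffer[ 0 * ncolor * nevt + icol * nevt + ievt ]
+  //   imag( icol, ievt ) -> buffer[ 1 * ncolor * nevt + icol * nevt + ievt ]
+  // e.g. for ncolor=3 and nevt=4, real(icol=2,ievt=1) is element 9 and imag(icol=2,ievt=1) is element 21.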
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads );       // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)

+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions;   // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes;      // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators;    // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators;  // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp;      // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not.
warned0) then
         print*,'Input jacobian 0 in genps'
@@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
       DOUBLE PRECISION ETA,ETAMIN,ETAMAX
       logical warned
       data warned/.false./
-
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 C------------
 C BEGIN CODE
 C------------
@@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)

 C     IF THERE IS NO S CHANNEL POLE USE BELOW:
       TAUMIN = 0d0 !SMIN/S !keep scale fix
-      TAUMAX = 1D0
+      if (dsqrt_shatmax.ne.-1d0)then
+         TAUMAX=dsqrt_shatmax**2/S
+      else
+         TAUMAX = 1D0
+      endif
       TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN
       SJACOBI= sjacobi*(TAUMAX-TAUMIN)
@@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config)
             if(sde_strat.eq.2)then
               t = dot(ptemp(0,-i), ptemp(0,-i))
               Mass = prmass(-i, config)
-              get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2
+              get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2
             endif
 c           write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut
             t = t/stot
@@ -1930,9 +1945,9 @@
             t = dot(ptemp(0,-i), ptemp(0,-i))
             Mass = prmass(-i, config)
             Width = prwidth(-i, config)
-            tmp = (t-Mass)*(t+Mass)
+            tmp = (t-Mass**2)
             tmp2 = Mass*Width
-            get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2
+            get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2)
             endif
 c           write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut
           endif
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
index 9e5f8d44dd..5360566ef4 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
@@ -231,6 +231,7 @@ subroutine set_peaks
       double precision x1,x2,xk(nexternal)
       double precision dr,mtot,etot,xqfact
       double precision spmass
+      double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot
       integer i, iconfig, l1, l2, j, nt, nbw, iproc, k
       integer iden_part(-nexternal+1:nexternal)
@@ -285,8 +286,8 @@ subroutine set_peaks
       integer lbw(0:nexternal) !Use of B.W.
       common /to_BW/ lbw
-      double precision stot,m1,m2
-      common/to_stot/stot,m1,m2
+      double precision real_stot,m1,m2
+      common/to_stot/real_stot,m1,m2
       include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters are translated from the old naming convention to the new one. + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
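The 'flavour_bias' additions above (the add_param plus the check_validity block) introduce a two-element run-card parameter: the abs(PDG) code of the flavour to enhance and the enhancement factor, with the event normalisation forced to 'bias' whenever the factor is non-trivial. A minimal standalone sketch of that validation; the helper name check_flavour_bias is illustrative and a plain ValueError stands in for InvalidRunCard:

    def check_flavour_bias(flavour_bias, event_norm):
        """Mirror of the run_card check added above (sketch only)."""
        if len(flavour_bias) != 2:
            raise ValueError("'flavour_bias' should contain exactly two numbers: "
                             "the abs(PDG) of the flavour and the enhancement factor")
        if any(i < 0 for i in flavour_bias):
            raise ValueError("flavour and multiplication factor should be positive")
        if flavour_bias[1] != 1 and event_norm != 'bias':
            # a non-trivial enhancement rescales event weights, so the
            # normalisation must be 'bias' (the real code also warns)
            event_norm = 'bias'
        return event_norm

    assert check_flavour_bias([5, 100], 'average') == 'bias'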
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, 
path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

\ Postscript Diagrams for $proc\<\/A\> \ \n"; for($j=1;$j<$pages;$j++){ - print PAGE "\\"Page \ \n"; + print PAGE "\\"Page \ \n"; }#end of for # -# In case I didn't include all of the diagrams as jpeg, warn user +# In case I didn't include all of the diagrams as PNG, warn user # - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
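The get_job_for_event hunk above now splits a gridpack channel into several jobs whenever the requested number of events exceeds max_request_event (set from --maxevts when nprocs > 1), then suffixes the split directories with letters. The splitting itself is a ceiling division; a worked sketch with illustrative numbers (the max_splitting cap is defined elsewhere in gen_ximprove, so its value here is an assumption):

    def n_split(needed_event, max_request_event, split_channels=True, max_splitting=130):
        """Ceiling division as in the hunk above: one job per block of
        max_request_event events, capped at max_splitting (sketch only)."""
        if not split_channels:
            return 1
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        return max(1, min(nb_split, max_splitting))

    assert n_split(12000, 2500) == 5   # 12000 requested events in blocks of 2500
    assert n_split(100, 2500) == 1     # small channels keep a single job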
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
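do_multi_run, patched above to also pick up MadSpin-decayed runs when merging, combines the per-run results with inverse-variance weights: cross = sum(x_i/sigma_i^2) / sum(1/sigma_i^2) and error = sqrt(1 / sum(1/sigma_i^2)). A small numerical sketch of that combination:

    import math

    def combine(results):
        """results: list of (cross, error) pairs, combined as in do_multi_run."""
        crossoversig = sum(x / e**2 for x, e in results)
        inv_sq_err = sum(1.0 / e**2 for x, e in results)
        return crossoversig / inv_sq_err, math.sqrt(1.0 / inv_sq_err)

    cross, error = combine([(10.0, 1.0), (12.0, 2.0)])
    # the more precise run dominates, and the combined error shrinks
    assert abs(cross - 10.4) < 1e-12 and error < 1.0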
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
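The hunk above makes Pythia8's bundled main164 example the default shower driver and keeps the old MG5aMC_PY8_interface available behind --old_interface. A sketch of the executable lookup, with the two candidate paths taken from the hunk; the helper name find_pythia_main is illustrative, and returning None stands in for the fallback that re-runs do_pythia8 with --old_interface:

    import os

    def find_pythia_main(pythia8_path):
        """Locate a compiled main164 under the Pythia8 installation (sketch)."""
        for candidate in (
                os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                os.path.join(pythia8_path, 'examples', 'main164')):
            if os.path.exists(candidate):
                return candidate
        return None  # caller falls back to the old MG5aMC-PY8 interface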
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
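The run_PY8.sh wrapper templates above are filled in two passes: the inner % ('' if use_mg5amc_py8_interface else '-c') resolves the optional -c flag first, with %%s escapes protecting the shell and executable slots, and the later exe_cmd%(shell_exe, os.path.basename(pythia_main)) fills those slots. A condensed illustration of the same two-stage substitution:

    template = '#!%%s\n./%%s %s PY8Card.dat >& PY8_log.txt\n' % '-c'
    # pass 1: each '%%s' survives as '%s'; the single '%s' becomes '-c'
    script = template % ('/bin/bash', 'main164')  # pass 2 fills shell and exe
    assert script.splitlines() == ['#!/bin/bash',
                                   './main164 -c PY8Card.dat >& PY8_log.txt']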
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s splits of channel %s.XXX have no events.lhe file (no points passed cuts); no %s split with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajobs for the refinement and run those! self.gscalefact = x_improve.gscalefact # store the jacobian associated with the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
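As an aside on the remove_empty_events hunk above: it classifies each empty G-channel directory by scanning the channel log from the end, with a line budget before giving up. The following standalone C++ sketch (editorial illustration, not plugin code; it simplifies away the 'Deleting file events.lhe' arming step, and the marker strings are simply copied from the patch) shows the same backward scan:

#include <cstdio>
#include <string>
#include <vector>

// Walk the log last-line-first, return the first known failure marker,
// and give up after a fixed line budget ("not found").
static std::string classifyEmptyChannel( const std::vector<std::string>& logLines )
{
  int budget = 150; // comparable to the 'found < -150' cutoff in the patch
  for( auto it = logLines.rbegin(); it != logLines.rend() && budget-- > 0; ++it )
  {
    if( it->find( "Impossible BW configuration" ) != std::string::npos ) return "bwconfig";
    if( it->find( "Loosen cuts or increase max_events" ) != std::string::npos ) return "cuts";
    if( it->find( "all returned zero" ) != std::string::npos ) return "zero";
  }
  return "not found"; // no marker near the end of the log: keep the directory
}

int main()
{
  const std::vector<std::string> log = { "INFO: starting channel",
                                         "Impossible BW configuration",
                                         "Deleting file events.lhe" };
  printf( "reason: %s\n", classifyEmptyChannel( log ).c_str() );
  return 0;
}

Scanning backwards matters here because the relevant diagnostics are printed just before the job deletes its events.lhe, so they sit within a bounded distance of the end of a potentially very long log.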
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent parameter.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/ee_mumu.mad/bin/madevent b/epochX/cudacpp/ee_mumu.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/madevent +++ b/epochX/cudacpp/ee_mumu.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('It looks like you have madgraph in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 18f664e0d1..a5438a65b0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -922,7 +922,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -935,7 +935,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -977,7 +977,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -989,7 +989,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1002,7 +1002,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1020,7 +1020,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -1033,7 +1033,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) ); @@ -1045,7 +1045,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -1058,7 +1058,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1078,7 +1078,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -1091,7 +1091,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. 
); @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1142,7 +1142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -1157,8 +1157,8 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. ); @@ -1173,7 +1173,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -1188,8 +1188,8 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 37676c1d8d..68296642b5 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
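A note on the HelAmps_sm.h hunks above: the couplings are now read through a dedicated CD_ACCESS template parameter instead of C_ACCESS, while the kernelAccessConst call sites stay identical, because the accessor is a policy class supplied at instantiation time. A toy C++ sketch of that pattern (AccessFixed, AccessPerEvent and readCoupling are hypothetical names for illustration, not the plugin's real MemoryAccess classes):

#include <cstdio>
using fptype = double;

struct AccessFixed // hypothetical: one process-constant coupling
{
  static fptype kernelAccessConst( const fptype* buf ) { return buf[0]; }
};

struct AccessPerEvent // hypothetical: event-dependent coupling in an SOA buffer
{
  static fptype kernelAccessConst( const fptype* buf ) { return buf[1]; } // pretend: this event's slot
};

template<class CD_ACCESS>
fptype readCoupling( const fptype* buf )
{
  return CD_ACCESS::kernelAccessConst( buf ); // same call site for any policy
}

int main()
{
  const fptype buf[2] = { 0.1, 0.2 };
  printf( "fixed=%f perEvent=%f\n", readCoupling<AccessFixed>( buf ), readCoupling<AccessPerEvent>( buf ) );
  return 0;
}

The policy is resolved at compile time, so swapping the access strategy costs nothing at runtime and the amplitude bodies never need to change.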
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 5fcde71f6b..edabf077ce 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -283,7 +283,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (e.g. for color sums) +// For both CUDA and HIP, by default, BLAS is enabled, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g.
using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index f27925604a..da149d8161 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006340742111206055  +DEBUG: model prefixing takes 0.005414009094238281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
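Stepping back to the mgOnGpuCxtypes.h hunk above: it adds compound assignment to the cxtype_ref proxy, so code holding references into split real/imaginary storage can accumulate in place. A self-contained C++ sketch of that reference-proxy idea (using plain std::complex stand-ins rather than the plugin's cxtype machinery, and omitting the __host__ __device__ decorations so it compiles with a host compiler):

#include <complex>
#include <cstdio>
using fptype = double;
using cxtype = std::complex<fptype>;

// Proxy over a complex number whose real and imaginary parts live in two
// separate fptype locations (SOA layout); += writes through both pointers.
class cxref
{
public:
  cxref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to the non-const real part
  fptype* const m_pimag; // const pointer to the non-const imaginary part
};

int main()
{
  fptype re[2] = { 0, 0 }, im[2] = { 0, 0 }; // split SOA storage
  cxref j0( re[0], im[0] );
  j0 = cxtype( 1., 2. );
  j0 += cxtype( 0.5, -1. ); // accumulate in place, no temporary complex written back by hand
  printf( "j0 = (%f, %f)\n", re[0], im[0] );
  return 0;
}

Without operator+=, an accumulation like jamp += amp would first have to materialise the value, add, and reassign; the new operators keep the read-modify-write inside the proxy.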
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,28 +154,28 @@ INFO: Process has 2 diagrams Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.259 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.267 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.659s -user 0m0.589s -sys 0m0.056s +real 0m0.701s +user 0m0.597s +sys 0m0.047s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
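The checkGpuBlas/assertGpuBlas helper added in GpuRuntime.h above mirrors the existing checkGpu pattern for cuBLAS/hipBLAS status codes. A minimal usage sketch (CUDA flavour, assuming cuBLAS is available, i.e. MGONGPU_HAS_NO_BLAS is not defined; link with -lcublas):

#include <cassert>
#include <cstdio>
#include <cublas_v2.h>

// Same shape as the helper in the hunk above: print file/line and abort on
// any status other than CUBLAS_STATUS_SUCCESS.
#define checkGpuBlas( code ) { assertGpuBlas( code, __FILE__, __LINE__ ); }
inline void assertGpuBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  cublasHandle_t handle;
  checkGpuBlas( cublasCreate( &handle ) );  // fail fast if cuBLAS cannot initialise
  checkGpuBlas( cublasDestroy( handle ) );
  printf( "cuBLAS handle created and destroyed\n" );
  return 0;
}

Wrapping every BLAS call this way gives the same fail-fast behaviour as checkGpu for runtime API calls, which is useful because BLAS errors otherwise surface much later as wrong color sums rather than as an immediate failure.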
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include <map> +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
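[Editor's note — illustration, not part of the patch. The MemoryAccessAmplitudes.h hunk above deletes the unused AOSOA machinery and keeps only "trivial access", where a per-event amplitude buffer is reinterpreted as an array of complex numbers with no index arithmetic. The standalone sketch below shows that pattern in isolation; fptype and cxtype_sv here are simplified stand-ins for the plugin's real types, and the two-amplitude buffer is invented for the example.]

#include <cassert>
#include <cstdio>

typedef double fptype;             // stand-in for the plugin's floating-point type
struct cxtype_sv { fptype r, i; }; // stand-in for the plugin's (possibly SIMD) complex type

// Trivial access: the buffer for one event is already laid out as [re,im][re,im]...
static inline cxtype_sv* kernelAccess( fptype* buffer )
{
  return reinterpret_cast<cxtype_sv*>( buffer );
}

int main()
{
  fptype buffer[4] = { 1., 2., 3., 4. }; // two complex amplitudes for one event
  cxtype_sv* amps = kernelAccess( buffer );
  assert( amps[1].r == 3. && amps[1].i == 4. );
  printf( "amp[1] = (%f, %f)\n", amps[1].r, amps[1].i );
  return 0;
}

[End of editor's note.]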
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
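[Editor's note — illustration, not part of the patch. The new DeviceAccessWavefunctions above addresses a global wavefunction buffer event-major: each event owns nw6*nx2 floats per wavefunction, and the kernel offsets by ievt * CPPProcess::nw6 * mgOnGpu::nx2 before reinterpreting as complex. The host-side sketch below reproduces only that offset arithmetic; the sizes and the tagging loop are invented for the example.]

#include <cstdio>
#include <vector>

typedef double fptype; // stand-in for the plugin's floating-point type

int main()
{
  const int nw6 = 6, nx2 = 2, nevt = 4; // components per wavefunction, floats per complex, mock event count
  std::vector<fptype> buffer( nevt * nw6 * nx2, 0. );
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    // Same offset arithmetic as the new kernelAccess: skip ievt wavefunctions of nw6*nx2 floats
    // (in the CUDA kernel, ievt would be blockDim.x * blockIdx.x + threadIdx.x)
    fptype* wf = buffer.data() + ievt * nw6 * nx2;
    wf[0] = ievt; // tag the first (real) component of this event's wavefunction
  }
  printf( "event 2 starts at flat index %d and holds %f\n", 2 * nw6 * nx2, buffer[2 * nw6 * nx2] );
  return 0;
}

[End of editor's note.]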
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 16a91dd141..cad6526137 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,279 +279,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif #endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - - oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) - - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -537,7 +448,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + +
// Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -570,6 +485,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -611,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -713,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -740,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -903,13 +1030,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -921,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -957,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // (in multichannel mode, also compute the running sums over helicities of the numerators and denominators) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1085,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1108,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1117,25 +1211,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1145,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1162,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1268,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 159826a904..50da8f60b2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..8fbdb5c7fb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -0,0 +1,381 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
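For context: the new color_sum.cc groups all of the color algebra in one translation unit. For each event and each good helicity, the color sum adds to |M|^2 the quadratic form sum_ij conj(jamp[i]) * CF[i][j] * jamp[j] / denom[i], where CF is the real color matrix and the jamps are the QCD partial amplitudes. A minimal standalone sketch of that reduction, in plain C++ with std::complex and hypothetical names (not the generated kernel itself, which uses the fptype/cxtype types and memory accessors):

// Sketch of the per-event color sum: since CF is real, the real and
// imaginary parts of the jamps can be reduced independently.
#include <complex>
#include <cstdio>

double colorSum( const std::complex<double>* jamp, // QCD partial amplitudes, one per color flow
                 const double* cf,                 // ncolor x ncolor real color matrix (row-major)
                 const double* denom,              // per-row color denominators
                 int ncolor )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] * jamp[j].real();
      ztempI += cf[i * ncolor + j] * jamp[j].imag();
    }
    deltaME += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return deltaME;
}

int main()
{
  const std::complex<double> jamp[1] = { { 1.5, -0.5 } }; // one color flow, as for e+ e- -> mu+ mu-
  const double cf[1] = { 1 }, denom[1] = { 1 };
  std::printf( "deltaME = %f\n", colorSum( jamp, cf, denom, 1 ) ); // 1.5^2 + 0.5^2 = 2.5
  return 0;
}

The BLAS path further below evaluates the same quadratic form as two gemm calls (Ztemp = CF * Jamps) followed by batched per-event dot products.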
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
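A quick numerical sanity check of the rewrite quoted in the comment above, as hypothetical standalone code (2x2 example, plain doubles): for a real symmetric M the full double loop and the upper-triangular form with the factor 2 folded in give the same quadratic form, which is why the generated code can loop over j > i only.

// Sketch: full symmetric quadratic form vs the upper-triangular form with
// "2*" folded in, as in TriangularNormalizedColorMatrix above.
#include <cassert>
#include <cmath>

int main()
{
  const double m[2][2] = { { 3, 1 }, { 1, 2 } }; // real symmetric "color matrix"
  const double a[2] = { 0.7, -1.2 };             // e.g. the real parts of the jamps
  double full = 0, tri = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += a[i] * m[i][j] * a[j]; // full double loop over all (i,j)
  for( int i = 0; i < 2; i++ )
  {
    double ztemp = m[i][i] * a[i]; // diagonal term
    for( int j = i + 1; j < 2; j++ )
      ztemp += 2 * m[i][j] * a[j]; // off-diagonal terms counted twice
    tri += ztemp * a[i];
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}

The same argument applied to the imaginary parts gives the AMA + BMB decomposition used for the complex jamps.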
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its double/hip equivalents) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h new file mode 100644 index 0000000000..f72dba0545 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h @@ -0,0 +1,77 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin. + +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 2 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); + oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); + FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 2 *** + // Wavefunction(s) for diagram number 2 + FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0,
&amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int
icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
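A minimal editorial sketch (not part of the patch; jampRealIdx and jampImagIdx are hypothetical helper names) of the "new1" jamp layout implemented by DeviceAccessJamp above: all real parts are stored first, then all imaginary parts, each as an ncolor-by-nevt block in which ievt runs fastest.

// Index helpers equivalent to the "new1" striding used by DeviceAccessJamp above
#include <cstddef>
inline std::size_t jampRealIdx( int icol, int ievt, int ncolor, int nevt )
{
  return std::size_t( 0 ) * ncolor * nevt + std::size_t( icol ) * nevt + ievt; // real part, "new1"
}
inline std::size_t jampImagIdx( int icol, int ievt, int ncolor, int nevt )
{
  return std::size_t( 1 ) * ncolor * nevt + std::size_t( icol ) * nevt + ievt; // imaginary part, "new1"
}

This layout lets the BLAS path treat the real and imaginary jamps as two contiguous column-major matrices, which is why the same striding can serve both the CUDA kernel and the cuBLAS/hipBLAS code paths.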
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
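A minimal standalone sketch (batchedDots is a hypothetical name; color_sum_blas earlier in this patch goes through the generic gpuBlasTgemmStridedBatched wrapper and uses beta=1 to accumulate over helicities) of the strided-batched GEMM trick used in "Step 2" of the BLAS color sum: nevt independent length-ncolor dot products are expressed as nevt batched 1x1 GEMMs in a single cuBLAS call.

#include <cublas_v2.h>
// d[ievt] = sum_icol a(icol,ievt) * z(icol,ievt), for ievt = 0..nevt-1
void batchedDots( cublasHandle_t handle, const float* a, const float* z, float* d, int ncolor, int nevt )
{
  const float alpha = 1.f;
  const float beta = 0.f; // the patch uses beta=1 instead, to add onto the running MEs
  // Batch ievt multiplies a 1 x ncolor row by an ncolor x 1 column:
  // - A starts at a+ievt and picks column ievt of the nevt x ncolor matrix a (lda=nevt, strideA=1)
  // - B starts at z+ievt*ncolor and picks column ievt of the ncolor x nevt matrix z (ldb=ncolor, strideB=ncolor)
  cublasStatus_t status = cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                                     1, 1, ncolor,
                                                     &alpha,
                                                     a, nevt, 1,
                                                     z, ncolor, ncolor,
                                                     &beta,
                                                     d, 1, 1, // each 1x1 result lands in d[ievt] (ldc=1, strideC=1)
                                                     nevt );  // nevt batches
  (void)status; // the patch checks this via checkGpuBlas
}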
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
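A minimal standalone sketch (with hypothetical sizes) of the pointer-to-array cast used in the C++ branch of the diagram boilerplate above: a flat buffer is reinterpreted as a two-dimensional [nwf][stride] array without copying, so w[iwf][j] aliases buf[iwf*stride+j].

#include <cassert>
int main()
{
  constexpr int nwf = 3, stride = 8; // hypothetical counts (the patch uses nwf and nw6*neppV*nx2)
  double buf[nwf * stride] = {};
  double( *w )[stride] = reinterpret_cast<double( * )[stride]>( buf ); // no copy, just a 2D view
  w[2][5] = 1.0;
  assert( buf[2 * stride + 5] == 1.0 ); // same storage, different indexing
  return 0;
}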
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 18f664e0d1..a5438a65b0 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -922,7 +922,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -935,7 +935,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -977,7 +977,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -989,7 +989,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1002,7 +1002,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1020,7 +1020,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -1033,7 +1033,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) ); @@ -1045,7 +1045,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -1058,7 +1058,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1078,7 +1078,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -1091,7 +1091,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. ); @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1142,7 +1142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -1157,8 +1157,8 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. 
); @@ -1173,7 +1173,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -1188,8 +1188,8 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 37676c1d8d..68296642b5 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 5fcde71f6b..edabf077ce 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -283,7 +283,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 453da8d298..9a3da618a4 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0065233707427978516  +DEBUG: model prefixing takes 0.005475044250488281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
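A minimal standalone sketch (cxref is a hypothetical name) of the reference-proxy pattern behind the cxtype_ref operator+= and operator-= added above: compound assignment writes through two separate pointers, which is what lets expressions like J_ACCESS::kernelAccessIcol( jamps, icol ) -= amp update the split real/imaginary jamp storage in place.

#include <complex>
struct cxref
{
  double* preal; // points into the block of real parts
  double* pimag; // points into the block of imaginary parts
  cxref& operator+=( const std::complex<double>& c )
  {
    *preal += c.real();
    *pimag += c.imag();
    return *this;
  }
  cxref& operator-=( const std::complex<double>& c )
  {
    *preal -= c.real();
    *pimag -= c.imag();
    return *this;
  }
};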
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,16 +156,16 @@ Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,53 +177,53 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.072 s +Wrote files for 10 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.127 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.991s -user 0m1.616s -sys 0m0.275s +real 0m1.923s +user 0m1.671s +sys 0m0.251s Code generation completed in 2 seconds ************************************************************ * * @@ -237,7 +237,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -245,9 +245,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -267,7 +267,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -275,9 +275,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 66598786f5..8e1283453a 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/Source/.make_opts b/epochX/cudacpp/gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt.mad/Source/makefile b/epochX/cudacpp/gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. 
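The NEWTON1 hunk in alfas_functions.f above guards the two-loop alpha_s step: when 1+A_IN*B0(NF)*T is not positive, the LOG in the two-loop formula would be evaluated on a non-positive argument, so the routine now returns the huge sentinel 9d98 instead. A minimal standalone C++ sketch of the same guard (function and parameter names are illustrative, not the plugin's API; b0 and c1 stand in for B0(NF) and C1(NF)):

#include <cmath>
#include <cstdio>

// One step of the alpha_s Newton iteration, with the guard added above
double newton1Step( double t, double aIn, double b0, double c1, int nloop )
{
  double aOut = aIn / ( 1. + aIn * b0 * t ); // one-loop value
  if( nloop == 1 ) return aOut;
  if( 1. + aIn * b0 * t <= 0. ) return 9e98; // guard: log() argument would be <= 0
  return aIn / ( 1. + b0 * aIn * t + c1 * aIn * std::log( 1. + aIn * b0 * t ) ); // two-loop value
}

int main()
{
  printf( "a_out = %g\n", newton1Step( 5., 0.1, 0.7, 0.4, 2 ) );
  return 0;
}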
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
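The Bridge.h hunks above wrap the "WARNING! Instantiate ..." printouts and the flagAbnormalMEs check in #ifdef MGONGPUCPP_VERBOSE, so they compile away entirely in default builds. A standalone sketch of this compile-time gate (toy function, not the plugin's code):

#include <iostream>

void instantiateHostBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... kernel and buffer setup would go here ...
}

int main()
{
  instantiateHostBridge( 16 ); // silent unless compiled with -DMGONGPUCPP_VERBOSE
  return 0;
}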
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
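GpuAbstraction.h above maps the cuBLAS and hipBLAS entry points onto common gpuBlas* names, then aliases the precision-agnostic gpuBlasT* names to the S (float) or D (double) variants according to MGONGPU_FPTYPE2_FLOAT, so each call site is written once. A toy illustration of the same aliasing in plain C++ (stand-in functions, no GPU or BLAS library required):

#include <cstdio>

// Stand-ins for cublasSgemm/hipblasSgemm and cublasDgemm/hipblasDgemm
void myBlasSgemm() { printf( "single-precision gemm\n" ); }
void myBlasDgemm() { printf( "double-precision gemm\n" ); }

// Same selection mechanism as the gpuBlasT* aliases above
#ifdef MGONGPU_FPTYPE2_FLOAT
#define myBlasTgemm myBlasSgemm
#else
#define myBlasTgemm myBlasDgemm
#endif

int main()
{
  myBlasTgemm(); // one precision-agnostic call site; build with -DMGONGPU_FPTYPE2_FLOAT to switch
  return 0;
}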
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
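The checkGpuBlas/assertGpuBlas wrapper added to GpuRuntime.h above turns a non-SUCCESS BLAS status into a printed file:line location plus an assert. A self-contained sketch of the same status-check pattern (a toy status type stands in for cublasStatus_t/hipblasStatus_t):

#include <cassert>
#include <cstdio>

typedef int myBlasStatus_t; // toy stand-in for the real BLAS status type
const myBlasStatus_t MYBLAS_STATUS_SUCCESS = 0;

#define checkMyBlas( code ) { assertMyBlas( code, __FILE__, __LINE__ ); }
inline void assertMyBlas( myBlasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != MYBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertMyBlas: '%d' in %s:%d\n", code, file, line ); // same message shape as assertGpuBlas
    if( abort ) assert( code == MYBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  checkMyBlas( MYBLAS_STATUS_SUCCESS ); // passes silently; a nonzero status would print and assert
  return 0;
}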
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include <cassert> +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
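After the cleanup above, KernelAccessAmplitudes keeps only the trivial access path: an amplitude buffer of interleaved (re,im) fptype pairs is reinterpreted in place as an array of complex values. A plain C++ toy of the same reinterpretation (std::complex<double> stands in for the plugin's cxtype_sv):

#include <complex>
#include <cstdio>

int main()
{
  double buffer[4] = { 1., 2., 3., 4. }; // two amplitudes stored as interleaved (re,im) pairs
  std::complex<double>* amps = reinterpret_cast<std::complex<double>*>( buffer ); // the trivial access
  printf( "amp[1] = (%f, %f)\n", amps[1].real(), amps[1].imag() ); // prints (3.000000, 4.000000)
  return 0;
}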
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
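The new DeviceAccessWavefunctions above computes a per-thread offset, buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2, i.e. an AOS layout with one nw6-component complex wavefunction per event, and the per-helicity "super-buffers" allocated in MatrixElementKernels.cc stack nGoodHel such blocks one after another. A host-side sketch of this offset arithmetic (nw6=6 and nx2=2 are the usual cudacpp values, assumed here for illustration; the ighel-major stacking is my reading of the allocation sizes, not a quote from the plugin):

#include <complex>
#include <cstdio>
#include <vector>

int main()
{
  const int nw6 = 6, nx2 = 2, nevt = 4, nGoodHel = 2;
  // One "super-buffer": nGoodHel blocks, each holding nevt wavefunctions of nw6 complex components
  std::vector<double> helWfs( nGoodHel * nevt * nw6 * nx2, 0. );
  const int ighel = 1, ievt = 2; // on the GPU, ievt would be blockDim.x * blockIdx.x + threadIdx.x
  double* block = helWfs.data() + ighel * nevt * nw6 * nx2; // select the per-helicity block
  std::complex<double>* wf = reinterpret_cast<std::complex<double>*>( block + ievt * nw6 * nx2 ); // per-event offset
  wf[0] = { 1., -1. }; // first component of this event's wavefunction
  printf( "wf[0] = (%f, %f)\n", wf[0].real(), wf[0].imag() );
  return 0;
}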
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
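[Editorial sketch, not part of the patch: the non-trivial device accessors above (e.g. DeviceAccessJamp2 earlier, with buffer[icol * nevt + ievt], and the per-event offsets computed from blockIdx/threadIdx) all follow the same transposed, structure-of-arrays convention, chosen so that consecutive GPU threads (consecutive ievt) touch consecutive addresses and global-memory accesses coalesce. A minimal CUDA illustration with a hypothetical kernel name, not a plugin function:]

// Element (icol, ievt) of a device super-buffer lives at buffer[icol * nevt + ievt]:
// adjacent threads then read/write adjacent addresses, so accesses coalesce.
__global__ void scaleAllJamp2s( double* buffer, const int ncol, const double factor )
{
  const int nevt = gridDim.x * blockDim.x; // one GPU thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncol; icol++ )
    buffer[icol * nevt + ievt] *= factor;
}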
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
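[Editorial sketch, not part of the patch: the color matrix block removed below computes |M|^2 for one helicity as the quadratic form sum_{i,j} conj(jamp[i]) * cf[i][j]/denom[i] * jamp[j] over the ncolor leading-color flows; since cf is real and symmetric, only the Re*cf*Re + Im*cf*Im terms survive. A standalone illustration of the same arithmetic, using the ncolor=2 denom/cf values quoted below and hypothetical jamp values:]

#include <complex>
#include <cstdio>
int main()
{
  const int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0., 1. }, { -1., 0.5 } }; // hypothetical amplitudes
  double me2 = 0; // color-summed |M|^2 for this helicity
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztR = 0, ztI = 0; // row icol of cf times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztR += cf[icol][jcol] * jamp[jcol].real();
      ztI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztR * jamp[icol].real() + ztI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "|M|^2 (color-summed) = %f\n", me2 );
  return 0;
}

[This mirrors the "CUDA START/END" branch being deleted; the C++ branch instead pre-normalizes a triangular cf/denom matrix at compile time and loops only over the upper triangle.]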
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const
int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
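[Editorial sketch, not part of the patch: the kernel body that follows first completes the running sum of per-helicity |M|^2 contributions into allMEs[ievt], then samples one good helicity by inverse-CDF: the first helicity whose cumulative fraction of the total exceeds the random number in [0,1) is selected. A host-side illustration with a hypothetical helper, using plain doubles instead of device buffers:]

#include <cassert>
// Pick a helicity index in [0, nGoodHel-1] given per-helicity |M|^2 values and rndhel in [0,1)
int selectHel( const double* meByHel, const int nGoodHel, const double rndhel )
{
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) total += meByHel[ighel];
  double running = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    running += meByHel[ighel];
    if( rndhel < running / total ) return ighel; // first helicity past the cumulative threshold
  }
  return nGoodHel - 1; // unreachable for rndhel < 1
}
int main()
{
  const double mes[3] = { 1., 2., 1. }; // cumulative fractions: 0.25, 0.75, 1.0
  assert( selectHel( mes, 3, 0.5 ) == 1 );
  return 0;
}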
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
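Before the body of the new file: color_sum.cc factors the color algebra out of the generated CPPProcess code. For one helicity it computes |M|^2 = sum_{i,j} conj(J_i) (C_ij / d_i) J_j from the color-ordered partial amplitudes J. A minimal standalone sketch of that reduction, using the ncolor=2 matrix and denominators defined just below (an illustration, not the plugin code):

```cpp
#include <array>
#include <complex>

// Sketch of the color-sum reduction for gg -> ttbar (ncolor = 2): jamp holds
// the two color-ordered partial amplitudes for one event and one helicity.
double colorSum( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr double colorDenom[2] = { 3, 3 };
  constexpr double colorMatrix[2][2] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < 2; jcol++ )
      ztemp += colorMatrix[icol][jcol] * jamp[jcol];
    me2 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol];
  }
  return me2; // contribution of this helicity to the running |M|^2 sum
}
```

This is the same quadratic form that MATRIX1 in matrix1.f evaluates with its triangular integer CF array further down in this diff.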
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its double/hip equivalent) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
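A note before the generated kernels: each diagramN below computes the amplitude of one Feynman diagram and accumulates it into the color-ordered partial amplitudes jamp[0..1], reusing the wavefunctions that earlier diagrams left in the shared w_fp buffer, so the three kernels must be executed in order (on the same stream in the GPU case). A standalone illustration of the accumulation they perform for gg -> ttbar (not the plugin code; amp1..amp3 stand for the three diagram amplitudes):

```cpp
#include <complex>

// Net effect of diagram1/diagram2/diagram3 on the two jamps, starting from a
// zeroed buffer (see the J_ACCESS::kernelAccessIcol lines in the kernels below):
void accumulateJamps( std::complex<double> jamp[2],
                      const std::complex<double>& amp1,
                      const std::complex<double>& amp2,
                      const std::complex<double>& amp3 )
{
  const std::complex<double> I( 0.0, 1.0 );
  jamp[0] = I * amp1 - amp2;  // diagram1 adds +i*amp, diagram2 subtracts amp
  jamp[1] = -I * amp1 - amp3; // diagram1 adds -i*amp, diagram3 subtracts amp
}
```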
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..97656450ad 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. 
+ return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
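A note on the striding variants discussed in the comments above: with the retained "new1" layout, the per-helicity jamp buffer is two contiguous ncolor*nevt blocks (all real parts first, then all imaginary parts), with ievt running fastest inside each block, so cuBLAS can view each block as a column-major nevt x ncolor matrix. A small self-contained check of the index arithmetic (illustrative only):

```cpp
#include <cassert>

// "new1" striding as in DeviceAccessJamp above: ix2 selects the real (0) or
// imaginary (1) block; within a block, icol selects the column and ievt runs
// fastest.
inline int jampIndexNew1( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt;
}

int main()
{
  const int ncolor = 2, nevt = 4;
  assert( jampIndexNew1( 0, 1, 2, ncolor, nevt ) == 6 );  // real part of (icol=1, ievt=2)
  assert( jampIndexNew1( 1, 1, 2, ncolor, nevt ) == 14 ); // imaginary part of the same element
  return 0;
}
```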
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
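The cudacpp.mk changes that follow add a build-time HASBLAS switch (hasBlas/hasNoBlas, e.g. `make HASBLAS=hasNoBlas`) and link cuBLAS/hipBLAS into the GPU targets; the sigmaKin comments earlier also mention a CUDACPP_RUNTIME_BLASCOLORSUM environment variable that decides at run time whether BLAS handles are created at all. The wiring of that variable is outside this diff, so the following is only a hypothetical sketch of such a gate:

```cpp
#include <cstdlib>

// Hypothetical runtime gate (assumed, not shown in this diff): create the
// cuBLAS/hipBLAS handles only when the environment variable is set; when the
// handle pointer stays null, color_sum_gpu falls back to the plain GPU kernel,
// as the asserts in sigmaKin and color_sum_gpu indicate.
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // HASBLAS=hasNoBlas builds never use BLAS
#else
  return std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ) != nullptr;
#endif
}
```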
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of name convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used or not; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC: it needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
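                    # (illustrative note on the float() casts added above: scan results may be
                    #  read back as strings, which old-style '%e' formatting rejects, e.g.
                    #    '%e' % '3.14'         -> TypeError
                    #    '%e' % float('3.14')  -> '3.140000e+00'
                    #  hence both 'bench' and 'data' are coerced to float before ff.write below)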
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) 
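# (usage sketch for the GPU round-robin added to MultiCore in cluster.py above, with
#  illustrative values: MG5_GPU_VISIBLE_DEVICES names a get/set pair of environment
#  variables, e.g.
#    export MG5_GPU_VISIBLE_DEVICES='ROCR_VISIBLE_DEVICES,HIP_VISIBLE_DEVICES'
#    export ROCR_VISIBLE_DEVICES=0,1
#  so worker daemon i then runs with HIP_VISIBLE_DEVICES set to GPU i % 2)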
if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, 
error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
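            # (rationale for the guard added below: an empty weight list would otherwise
            #  surface as an opaque IndexError when all_weights[2]['MERGING'] is read a
            #  few lines further down)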
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = 
float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) 
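The multi-run bookkeeping above (crossoversig, inv_sq_err) is an inverse-variance weighted average of the per-run cross sections, which is the minimum-variance combination of independent estimates. A standalone sketch of the same arithmetic (the function name is illustrative):

```python
import math

def combine_cross_sections(runs):
    """Combine (cross, error) pairs with inverse-variance weights."""
    sum_w = sum(1.0 / err**2 for _, err in runs)   # total weight
    sum_wx = sum(x / err**2 for x, err in runs)    # weighted sum of cross sections
    return sum_wx / sum_w, math.sqrt(1.0 / sum_w)  # combined cross and error

cross, error = combine_cross_sections([(10.0, 0.5), (10.4, 0.25)])
# The more precise run dominates: cross == 10.32, error == sqrt(1/20) ~ 0.224
```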
self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
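For the new direct-Pythia8 path above, the driver executable is resolved by probing two conventional locations of Pythia8's main164 example before giving up and recursing with --old_interface. A condensed sketch of that lookup (the helper name is hypothetical; the two candidate paths are the ones probed in the patch):

```python
import os
pjoin = os.path.join

def find_pythia8_main164(pythia8_path):
    """Return the main164 driver path, or None to signal the old-interface fallback."""
    candidates = [
        pjoin(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),  # installed layout
        pjoin(pythia8_path, 'examples', 'main164'),                      # in-source layout
    ]
    for exe in candidates:
        if os.path.exists(exe):
            return exe
    return None  # caller logs a warning and retries with --old_interface
```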
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
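The per-split card rewriting above keeps the combined HepMC sample correctly normalized: each split shower job only sees partition_for_PY8[i] events, so its card receives that event count and an HEPMCoutput:scaling multiplied by it. A minimal sketch of the bookkeeping under those assumptions (the helper name is illustrative):

```python
def per_split_card_settings(base_scaling, partition):
    """One settings dict per split shower job (illustrative)."""
    return [
        {
            'Main:numberOfEvents': n,                       # events this split actually sees
            'HEPMCoutput:scaling': base_scaling * float(n), # rescale weights accordingly
        }
        for n in partition
    ]

print(per_split_card_settings(1.0, [250, 250, 123]))
```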
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
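The remove_empty_events helper added above prunes G directories whose events.lhe is empty and classifies each failure by scanning the channel log from the end (via misc.BackRead). A condensed sketch of the classification loop, using a plain reversed read instead of BackRead and only the headline patterns (the real helper also re-queues channels whose log carries no verdict and handles BW-conflict bookkeeping):

```python
import collections, os

def classify_empty_gdirs(gdirs):
    """Keep G directories with a usable events.lhe; bucket the empty ones by cause."""
    reasons = collections.defaultdict(list)
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:  # same "more than a few bytes" test as the patch
            kept.append(gdir)
            continue
        try:
            with open(os.path.join(gdir, 'log.txt')) as log:
                lines = list(log)[::-1]  # read from the end, like misc.BackRead
        except OSError:
            reasons['not found'].append(gdir)
            continue
        for line in lines:
            if 'Impossible BW configuration' in line:
                reasons['bwconfig'].append(gdir)
                break
            if 'Loosen cuts or increase max_events' in line:
                reasons['cuts'].append(gdir)
                break
            if 'all returned zero' in line:
                reasons['zero'].append(gdir)
                break
        else:
            reasons['unknown'].append(gdir)
    return kept, reasons
```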
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt.mad/bin/madevent b/epochX/cudacpp/gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index febf1dcf42..5ca71ae17f 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index d09f387480..e6b63799bd 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index ba434e7b98..a2c8f92751 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 816b17272d..6bb0010b24 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006091594696044922  +DEBUG: model prefixing takes 0.005433559417724609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.141 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.544s -user 0m0.472s -sys 0m0.060s +real 0m0.547s +user 0m0.487s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
     m_pmek->setGrid( m_gpublocks, m_gputhreads );
   }
 #endif
@@ -347,7 +353,9 @@ namespace mg5amcCpu
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
     copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     copyHostFromDevice( m_hstSelHel, m_devSelHel );
     copyHostFromDevice( m_hstSelCol, m_devSelCol );
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@ namespace mg5amcCpu
     }
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
       memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
 
+#include "mgOnGpuConfig.h"
+
 #include <cassert>
 
 //--------------------------------------------------------------------------
 
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset
 
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
 
 //--------------------------------------------------------------------------
 
 #elif defined __HIPCC__
 
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset
 
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
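
The GpuAbstraction.h and GpuRuntime.h hunks above do three things: they add stream and BLAS aliases (gpuStream_t, gpuBlasHandle_t and the gpuBlas*/GPUBLAS_* macros) that map onto either CUDA/cuBLAS or HIP/hipBLAS, they add a checkGpuBlas/assertGpuBlas error-checking helper mirroring the existing checkGpu/assertGpu pair, and they make the runtime quieter by default (the setUp/tearDown debug printouts, like the Bridge and kernel WARNING printouts earlier in this patch, are now emitted only in MGONGPUCPP_VERBOSE builds). The fragment below is a minimal sketch, not part of the patch, of how these macros are meant to compose in a CUDA build with BLAS enabled; it assumes only the names defined in the hunks above, and fillOnes is a placeholder kernel.

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h"

  __global__ void fillOnes( double* out ) // placeholder kernel, not part of the plugin
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    out[ievt] = 1.;
  }

  void sketchStreamAndBlasSetup( const int gpublocks, const int gputhreads )
  {
    double* devBuf = nullptr;
    gpuMalloc( &devBuf, gpublocks * gputhreads * sizeof( double ) ); // expands to checkGpu( cudaMalloc( ... ) )
    gpuStream_t stream;
    gpuStreamCreate( &stream ); // expands to checkGpu( cudaStreamCreate( &stream ) )
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate, asserting on any non-SUCCESS status
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS calls on this handle now run on 'stream'
    gpuLaunchKernelStream( fillOnes, gpublocks, gputhreads, stream, devBuf ); // fillOnes<<<gpublocks, gputhreads, 0, stream>>>( devBuf )
    checkGpu( gpuDeviceSynchronize() );
    checkGpuBlas( gpuBlasDestroy( handle ) );
    gpuStreamDestroy( stream );
    gpuFree( devBuf );
  }
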
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 
 #include <cassert>
+#include <memory>
 
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 
   private:
 
@@ -191,12 +194,24 @@ namespace mg5amcCpu
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;
 
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
 
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
 
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };
 
   //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
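
The MemoryAccessAmplitudes.h hunk above deletes the dormant AOSOA access machinery (the MGONGPU_TRIVIAL_AMPLITUDES branch had been hardcoded on since its introduction) and keeps only the trivial accessors. Trivial here means that the amplitude buffer handed to the accessor already points at the record for the current event or SIMD event page, so no page/offset arithmetic is needed and access is a plain reinterpret_cast. A condensed restatement of the surviving pattern, assuming the usual fptype and cxtype_sv definitions from mgOnGpuCxtypes.h (this is an illustration, not a new API):

  // The buffer holds exactly one amplitude record at the kernel's current
  // position, so it can be reinterpreted in place as a cxtype_sv array.
  template<bool onDevice>
  class KernelAccessAmplitudesSketch // illustrative copy of the pattern kept above
  {
  public:
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
    static __host__ __device__ inline const cxtype_sv*
    kernelAccessConst( const fptype* buffer )
    {
      return reinterpret_cast<const cxtype_sv*>( buffer );
    }
  };
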
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif
 
   //----------------------------------------------------------------------------
 
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };
 
   //----------------------------------------------------------------------------
 
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 
 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
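
In the MemoryAccessWavefunctions.h hunk above, the old AOSOA machinery likewise disappears, but the device and host sides now genuinely differ: on the host the buffer still holds a single wavefunction record and access stays trivial, while on the device each thread indexes its own record inside a large nevt-sized buffer (the per-helicity wavefunction super-buffers allocated in MatrixElementKernels.cc). The index arithmetic behind 'buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2' is worth spelling out; a small sketch with illustrative constants (nw6=6 components per wavefunction, nx2=2 for real and imaginary parts):

  #include <cassert>

  // Device-side layout: one AOS record per event, wfs[ievt][iw6][ix2] flattened
  // as wfs[ ievt * nw6 * nx2 + iw6 * nx2 + ix2 ].
  constexpr int nw6sketch = 6; // illustrative: components per HELAS wavefunction
  constexpr int nx2sketch = 2; // illustrative: real and imaginary parts
  inline int wfIndex( const int ievt, const int iw6, const int ix2 )
  {
    assert( iw6 >= 0 && iw6 < nw6sketch );
    assert( ix2 >= 0 && ix2 < nx2sketch );
    return ievt * nw6sketch * nx2sketch + iw6 * nx2sketch + ix2;
  }
  // Example: the imaginary part of component 3 for event 10 sits at
  // wfIndex( 10, 3, 1 ) = 10*12 + 3*2 + 1 = 127.
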
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 5c7a133eed..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,294 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
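// For illustration only: the access-class comments above distinguish "non-trivial access"
// (one buffer holds a given quantity for ALL events, so kernels index it by event) from
// "TRIVIAL ACCESS" (the buffer already points at a single event or SIMD page). A minimal
// standalone C++ sketch of the two indexing patterns; the names soaAccess/trivialAccess
// are hypothetical, not the plugin's MemoryAccess classes:
#include <cassert>
typedef double fptype; // stand-in for the plugin's fptype
// Non-trivial (SoA) access: index by field AND event within a buffer covering all nevt events
inline fptype& soaAccess( fptype* buffer, int ifield, int ievt, int nevt )
{
  assert( ievt < nevt );
  return buffer[ifield * nevt + ievt]; // field-major across events (coalesced reads on GPU)
}
// Trivial access: the buffer holds one event record, index by field only
inline fptype& trivialAccess( fptype* buffer, int ifield )
{
  return buffer[ifield];
}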
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif

     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
       const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
       for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif

-      // *** DIAGRAM 1 OF 3 ***
-
-      // Wavefunction(s) for diagram number 1
-      vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-      vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 3 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[0] -= amp_sv[0];
-      // *** DIAGRAM 3 OF 3 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] -= amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
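// For illustration only: the deleted diagram code above adds each Feynman diagram's
// amplitude into the two leading-color flows (jamps) with fixed coefficients, before the
// color matrix below is applied. A standalone sketch using std::complex in place of the
// plugin's cxtype_sv (amp1..amp3 stand for the three FFV1_0 outputs):
#include <complex>
void addDiagramsToJamps( std::complex<double> jamp[2],
                         const std::complex<double>& amp1,  // diagram 1 (s-channel gluon)
                         const std::complex<double>& amp2,  // diagram 2
                         const std::complex<double>& amp3 ) // diagram 3
{
  const std::complex<double> I( 0, 1 );
  jamp[0] += I * amp1; // diagram 1 feeds both color flows, with opposite signs
  jamp[1] -= I * amp1;
  jamp[0] -= amp2; // diagram 2 feeds color flow 0 only
  jamp[1] -= amp3; // diagram 3 feeds color flow 1 only
}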
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-      // === C++ START ===
-      // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv& jampRi_sv = jampR_sv[icol];
-      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-      // Off-diagonal terms
-      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-      {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRj_sv = jampR_sv[jcol];
-        fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-      }
-      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      deltaMEs_previous += fpvsplit0( deltaMEs2 );
-      deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-      deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-      // === C++ END ===
 #else
-      // === CUDA START ===
-      fptype2_sv ztempR_sv = { 0 };
-      fptype2_sv ztempI_sv = { 0 };
-      for( int jcol = 0; jcol < ncolor; jcol++ )
-      {
-        fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-        fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-        ztempR_sv += cf[icol][jcol] * jampRj_sv;
-        ztempI_sv += cf[icol][jcol] * jampIj_sv;
-      }
-      deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-      // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-    }
-    // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
-
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -552,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -727,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const 
int ievt0 = ipagV * neppV;
       const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
       fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP( gs, couplings, bsmIndepParam );
+      G2COUP( gs, couplings, bsmIndepParam );
     }
 #endif
   }

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -754,25 +664,40 @@ namespace mg5amcCpu
                        fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
                        fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
...
+  }
+#endif

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt )
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    // Running sum of the MEs over all good helicities for this event
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -917,13 +1031,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -971,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for 
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1159,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -19,6 +19,7 @@
 #include "mgOnGpuVectors.h"
+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"
 #include <vector>
@@ -75,17 +76,17 @@ namespace mg5amcCpu
     static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
     static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
     static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+    static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-
     // Hardcoded parameters for this process (constant class variables)
     // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
     // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)]
-    // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-    //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)]
+    static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)

     // Other variables of this instance (???)
     //static const int ninitial = CPPProcess::npari;
     //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles)
     //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
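For orientation before the implementation below: everything in this new file evaluates, for one helicity, the color-matrix quadratic form |M|^2 += sum_ij jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j, either in hand-written kernels or via two BLAS gemm calls. A minimal reference sketch of that formula, using plain std::complex and none of the SIMD/GPU buffer layouts (colorSumReference is a name invented for this sketch, not code from this PR):

#include <array>
#include <complex>
inline double colorSumReference( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr double denom[2] = { 3, 3 };                   // gg_tt color denominators (as below)
  constexpr double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // gg_tt color matrix (as below)
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
    for( int jcol = 0; jcol < 2; jcol++ )
      me2 += ( std::conj( jamp[icol] ) * cf[icol][jcol] * jamp[jcol] ).real() / denom[icol];
  return me2; // real because the color matrix is real and symmetric
}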
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h new file mode 100644 index 0000000000..173f24d4cf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h @@ -0,0 +1,106 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
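Each of the three kernels in the new diagrams.h below computes one Feynman diagram and folds its amplitude into the two color jamps. Stripped of the accessor and SIMD machinery, the accumulation pattern they implement reduces to the following reference-only sketch with plain complex numbers (accumulateJamps is an invented name, not code from this PR):

#include <complex>
// amp[0..2]: amplitudes of diagrams 1..3; jamp[0..1]: the two color flows
inline void accumulateJamps( const std::complex<double> amp[3], std::complex<double> jamp[2] )
{
  const std::complex<double> cI( 0, 1 );
  jamp[0] += cI * amp[0]; // diagram 1
  jamp[1] -= cI * amp[0]; // diagram 1
  jamp[0] -= amp[1];      // diagram 2
  jamp[1] -= amp[2];      // diagram 3
}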
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
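Before the header itself, a note on selection: color_sum_gpu (defined in color_sum.cc above and declared below) dispatches on its pBlasHandle argument, with a null handle meaning "use the hand-written kernel". A hedged sketch of the kind of runtime opt-in implied by the CUDACPP_RUNTIME_BLASCOLORSUM comments (helper name and exact selection logic are assumptions, not code from this PR):

#include <cstdlib>
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // build without cuBLAS/hipBLAS: only the kernel path exists
#else
  return std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ) != nullptr; // runtime opt-in (assumed semantics)
#endif
}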
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + 
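Spelled out, the "new1" striding used by DeviceAccessJamp above stores all real parts first and all imaginary parts second, each as an ncolor x nevt matrix with the event index running fastest; this is what lets cuBLAS consume the same buffer without repacking. A minimal index sketch of that layout (jampIndex is an invented helper name for this sketch):

// real part of jamp(icol,ievt) lives at jampIndex( 0, icol, ievt, ... ), imag part at jampIndex( 1, ... )
inline int jampIndex( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt; // "new1" striding
}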
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
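The boilerplate header that follows documents the wavefunction super-buffer layout in its comments (nwf sub-buffers of nevt*nw6*nx2 fptypes each). A minimal sketch of the slot arithmetic it describes, with double standing in for the plugin's fptype and an invented helper name:

// wfs holds nwf wavefunctions; each is a contiguous [nevt * nw6 * nx2] sub-buffer (nx2 = 2 for re/im)
inline double* wavefunctionSlot( double* wfs, int iwf, int nevt, int nw6, int nx2 = 2 )
{
  return wfs + iwf * nevt * nw6 * nx2; // same stride as w_fp[iwf] in the GPU branch of the boilerplate
}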
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index febf1dcf42..5ca71ae17f 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1.
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index d09f387480..e6b63799bd 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
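The HelAmps_sm.h changes above are a pure rename of the coupling accessor template parameter (C_ACCESS to CD_ACCESS, marking that the dependent-couplings accessor is expected); no behavior changes. A self-contained toy of the accessor-template pattern, with invented stand-in names:

#include <complex>
struct ToyAccessCouplings // stand-in for HostAccessCouplings / DeviceAccessCouplings
{
  static std::complex<double> kernelAccessConst( const double* buf ) { return { buf[0], buf[1] }; }
};
template<class CD_ACCESS> // renamed from C_ACCESS in this PR
std::complex<double> readCoupling( const double* allCOUP )
{
  return CD_ACCESS::kernelAccessConst( allCOUP );
}
// usage: double g[2] = { 0.1, 0.0 }; std::complex<double> c = readCoupling<ToyAccessCouplings>( g );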
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index ba434e7b98..a2c8f92751 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O.
Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, enable BLAS support, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 6466d14e6d..fad65df6b1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00611424446105957  +DEBUG: model prefixing takes 0.005459308624267578  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.020 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -198,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.184 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.319 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.311 s +ALOHA: aloha creates 10 routines in 0.303 s VVV1 VVV1 FFV1 @@ -226,41 +226,41 @@ ALOHA: aloha creates 10 routines in 0.311 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m2.583s -user 0m2.278s -sys 0m0.302s -Code generation completed in 3 seconds +real 0m2.626s +user 0m2.304s +sys 0m0.321s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -273,7 +273,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -281,9 +281,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -303,7 +303,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -311,9 +311,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index e50becb2d9..4b103d9e55 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 1711d30371..d4c7c73e61 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 364dbd21b0..730a05e322 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/makefile b/epochX/cudacpp/gg_tt01g.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt01g.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
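Note: the GpuAbstraction.h hunk below extends the existing single-source GPU abstraction (each gpu* alias resolves to a cuda* symbol under __CUDACC__ and to a hip* symbol under __HIPCC__) with stream handling and cuBLAS/hipBLAS aliases. A minimal usage sketch of the alias layer follows; it is not part of the diff, and hstBuf, myKernel, nevt, gpublocks and gputhreads are illustrative names only:

// Sketch: one code path serves both CUDA and HIP builds via the gpu* aliases.
fptype* devBuf = nullptr;
gpuMalloc( (void**)&devBuf, nevt * sizeof( fptype ) );      // cudaMalloc or hipMalloc, wrapped in checkGpu
gpuMemcpy( devBuf, hstBuf, nevt * sizeof( fptype ), gpuMemcpyHostToDevice );
gpuStream_t stream;
gpuStreamCreate( &stream );                                 // cudaStreamCreate or hipStreamCreate
gpuLaunchKernelStream( myKernel, gpublocks, gputhreads, stream, devBuf ); // kernel<<<blocks, threads, 0, stream>>>
gpuStreamDestroy( stream );
gpuFree( devBuf );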
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
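Note: with the gpuBlasT* aliases at the end of GpuAbstraction.h above, the floating-point precision of the BLAS color sum is fixed at build time: MGONGPU_FPTYPE2_FLOAT selects the Sgemm family, otherwise the Dgemm family is used. A minimal sketch of a strided-batched GEMM through the precision-agnostic alias (not part of the diff; handle, dA/dB/dC, the dimensions and the strides are illustrative names):

// Resolves to cublasSgemmStridedBatched / hipblasSgemmStridedBatched (FPTYPE2=float)
// or to the Dgemm variants (FPTYPE2=double); checkGpuBlas is defined in GpuRuntime.h.
const fptype2 alpha = 1, beta = 0;
checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                          m, n, k,
                                          &alpha, dA, m, strideA, dB, k, strideB,
                                          &beta, dC, m, strideC, batchCount ) );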
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
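Note: the checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above mirrors the existing checkGpu/assertGpu wrapper: any cuBLAS/hipBLAS call returning a gpuBlasStatus_t can be wrapped so that a code other than GPUBLAS_STATUS_SUCCESS triggers an assertion with file and line information. A minimal sketch (not part of the diff; the handle name is illustrative):

gpuBlasHandle_t handle;
checkGpuBlas( gpuBlasCreate( &handle ) );  // aborts with file:line info if creation fails
checkGpuBlas( gpuBlasDestroy( handle ) );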
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS 
temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
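[Editor's note] A note on the per-helicity "super-buffers" added to MatrixElementKernels.h above: each member is a single contiguous device allocation that packs nGoodHel per-helicity slices back to back (nGoodHel is only known at runtime, hence the std::unique_ptr members). A minimal host-side sketch of the slicing convention, using hypothetical names (ghelBuffer, sliceSize) rather than the plugin's actual device buffers:

// Minimal sketch of the "super-buffer" slicing convention (hypothetical names;
// the plugin allocates the equivalent buffers in device memory instead).
#include <cassert>
#include <vector>

int main()
{
  const int nGoodHel = 4;     // number of good helicities (runtime value)
  const int nevt = 16;        // number of events
  const int sliceSize = nevt; // one element per event in this sketch
  // One contiguous "super-buffer" instead of nGoodHel separate buffers:
  std::vector<double> ghelBuffer( nGoodHel * sliceSize, 0. );
  // The slice for good helicity ighel starts at offset ighel * sliceSize:
  const int ighel = 2, ievt = 5;
  ghelBuffer[ighel * sliceSize + ievt] = 1.;
  assert( ghelBuffer[2 * 16 + 5] == 1. );
  return 0;
}

This matches the ighel * nevt + ievt indexing visible later in the select_hel and sigmaKin changes.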
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
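[Editor's note] On the "trivial access" pattern kept above for amplitudes: the buffer stores real and imaginary parts contiguously, so kernelAccess can simply return the buffer pointer reinterpreted as a pointer to complex values, with no index arithmetic at all. A minimal sketch, with std::complex standing in for cxtype_sv (an assumption for illustration only; the plugin uses its own complex types, which rely on the same interleaved layout):

// Sketch of the trivial pointer reinterpretation: an fptype buffer laid out
// as [re0, im0, re1, im1, ...] viewed as an array of complex values.
#include <cassert>
#include <complex>

using fptype = double;

int main()
{
  fptype buffer[4] = { 1., 2., 3., 4. }; // two complex amplitudes
  auto* amps = reinterpret_cast<std::complex<fptype>*>( buffer );
  assert( amps[0] == std::complex<fptype>( 1., 2. ) );
  assert( amps[1] == std::complex<fptype>( 3., 4. ) );
  return 0;
}

(std::complex explicitly guarantees this array-compatible layout, which is why the cast is safe in this sketch.)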
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
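[Editor's note] The DeviceAccessWavefunctions class that this hunk starts introducing (its kernelAccess body follows just below) replaces the old AOSOA layout with a flat per-event AOS: each event owns nw6 * nx2 consecutive fptype values, so the accessor only needs the offset ievt * nw6 * nx2. A standalone sketch of that offset arithmetic, with hypothetical sizes (in the plugin, ievt comes from the CUDA thread index blockDim.x * blockIdx.x + threadIdx.x):

// Standalone sketch of the per-event AOS offset used by the new accessor.
#include <cassert>

int main()
{
  constexpr int nw6 = 6;  // components per (fermion or vector) wavefunction
  constexpr int nx2 = 2;  // real and imaginary parts
  constexpr int nevt = 4; // number of events (hypothetical)
  double buffer[nevt * nw6 * nx2] = {};
  const int ievt = 3;
  double* w = buffer + ievt * nw6 * nx2; // first fptype of event ievt
  w[0] = 1.;                             // real part of component 0
  assert( buffer[3 * 6 * 2] == 1. );
  return 0;
}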
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
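[Editor's note] One detail worth flagging in the MemoryBuffers.h hunks that follow: virtual ~BufferBase() moves from protected to public. That is what allows the new std::unique_ptr members in MatrixElementKernels.h to destroy their buffers when reset or when the kernel object goes out of scope. A minimal sketch of the constraint (hypothetical Buffer class, not the plugin's):

// Why the destructor must be public: std::unique_ptr<T> invokes T's
// destructor directly, so the destructor must be accessible at that point.
#include <cstddef>
#include <memory>

class Buffer
{
public:
  Buffer( std::size_t size ) : m_size( size ) {}
  virtual ~Buffer() {} // must be public for std::unique_ptr<Buffer> to compile
private:
  std::size_t m_size;
};

int main()
{
  std::unique_ptr<Buffer> pBuf;     // like m_pHelMEs etc. above
  pBuf.reset( new Buffer( 1024 ) ); // like m_pHelMEs.reset( new DeviceBufferSimple( ... ) )
  return 0;                         // pBuf destroys the Buffer here
}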
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
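[Editor's note] Before the rewritten buffer setup that follows: in mixed-precision builds (double fptype for amplitudes, float fptype2 for the color algebra) the C++ path of calculate_jamps processes two SIMD event pages per call (nParity = 2, as defined earlier in this file), so the first event of each page is ievt00 + iParity * neppV. A minimal sketch of that page indexing, assuming neppV = 4 (hypothetical; the real value depends on the SIMD width):

// Sketch of the event-page indexing used by the iParity loop below.
#include <cstdio>

constexpr int neppV = 4;   // events per SIMD vector (hypothetical value)
constexpr int nParity = 2; // 2 only in mixed double/float mode, otherwise 1

int main()
{
  const int ievt00 = 8; // first event of the current page pair
  for( int iParity = 0; iParity < nParity; ++iParity )
  {
    const int ievt0 = ievt00 + iParity * neppV; // first event of this page
    std::printf( "page %d covers events [%d,%d)\n", iParity, ievt0, ievt0 + neppV );
  }
  return 0;
}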
- - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
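[Editor's note] For reference, the color algebra that the removed block below implemented (and that the new color_sum code takes over): |M|^2 accumulates the quadratic form jamp-dagger times (cf/denom) times jamp; since cf is real and symmetric, only the AMA + BMB terms survive (see #475). A standalone scalar sketch using the ncolor=2 matrices visible in the removed code (this is an illustration, not the plugin's vectorized implementation):

// Scalar sketch of the color sum for this process (ncolor=2,
// cf={{16,-2},{-2,16}}, denom={3,3}, values from the removed code below).
#include <complex>
#include <cstdio>

constexpr int ncolor = 2;
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
constexpr double denom[ncolor] = { 3, 3 };

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol][jcol] * jamp[jcol];
    // cf is real and symmetric: the quadratic form reduces to AMA + BMB (#475)
    me2 += ( ztemp.real() * jamp[icol].real() + ztemp.imag() * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 0., 1. }, { -1., 0. } };
  std::printf( "|M|^2 = %f\n", colorSum( jamp ) ); // prints 32/3 for this input
  return 0;
}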
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ -
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
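The per-helicity "super-buffer" slicing used throughout this loop (ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2, ghelAllNumerators + ighel * nevt, and so on) is the invariant that keeps the helicity streams independent. A minimal standalone C++ sketch of the same offset arithmetic, with made-up sizes and hypothetical names (not the plugin's real buffer classes):

    #include <cassert>
    #include <vector>

    int main()
    {
      // Illustrative sizes only; the plugin derives these from the process and the GPU grid
      const int nGoodHel = 4, nevt = 8, ncolor = 2, nx2 = 2;
      const int jampSize = ncolor * nx2 * nevt; // elements per helicity in the jamp super-buffer
      std::vector<double> ghelAllJamps( nGoodHel * jampSize, 0. ); // one contiguous allocation
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
        double* hAllJamps = ghelAllJamps.data() + ighel * jampSize; // slice owned by stream ighel
        hAllJamps[0] = ighel; // each per-helicity kernel writes only inside its own slice
      }
      assert( ghelAllJamps[2 * jampSize] == 2. ); // slice 2 starts at offset 2 * jampSize
      return 0;
    }

Because every stream writes to a disjoint slice, no synchronisation is needed between the per-helicity kernels; only the shared colAllJamp2s accumulator needs the atomicAdd seen in update_jamp2s above.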
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
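For orientation before reading the kernels in this new file: all three implementations below (C++/SIMD, plain GPU kernel, and BLAS) evaluate the same color sum for one helicity, deltaME = sum_ij conj(jamp[i]) * colorMatrix[i][j] / colorDenom[i] * jamp[j]. A minimal standalone sketch with the ncolor=2 matrix of this P1_gg_ttx process (the jamp test values are made up):

    #include <complex>
    #include <cstdio>

    int main()
    {
      constexpr int ncolor = 2;
      const double colorDenom[ncolor] = { 3, 3 };
      const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
      const std::complex<double> jamp[ncolor] = { { 1., 2. }, { 0.5, -1. } }; // made-up test amplitudes
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol];
        deltaME += ( std::conj( jamp[icol] ) * ztemp ).real(); // imaginary part vanishes for a real symmetric matrix
      }
      std::printf( "deltaME = %f\n", deltaME );
      return 0;
    }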
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
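Each diagramN kernel in this new file repeats the same multichannel (SDE) bookkeeping: the numerator picks up |amp|^2 only from the diagram matching the event's channelId, the denominator picks up |amp|^2 from every diagram, and sigmaKin later rescales the ME by numerator/denominator (see normalise_output in CPPProcess.cc). A minimal sketch of that logic with made-up amplitudes:

    #include <complex>
    #include <cstdio>

    int main()
    {
      const unsigned int channelId = 2; // this event samples diagram 2 (0 disables the SDE enhancement)
      const std::complex<double> amp[3] = { { 1, 1 }, { 2, 0 }, { 0, 3 } }; // made-up per-diagram amplitudes
      double numerator = 0, denominator = 0;
      for( unsigned int idiag = 1; idiag <= 3; idiag++ )
      {
        const double a2 = std::norm( amp[idiag - 1] ); // |amp|^2, the role played by cxabs2 below
        if( channelId == idiag ) numerator += a2;      // only the event's own diagram
        if( channelId != 0 ) denominator += a2;        // every diagram
      }
      std::printf( "SDE reweighting factor: %f\n", numerator / denominator );
      return 0;
    }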
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 3 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
index ec5722702a..c08048ad0e 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
index 707ea40323..97656450ad 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index c8b3dbf03c..0fa180cdf8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
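The comments around this point contrast the two buffer strategies: on the C++ side, wavefunctions keep "trivial access", living on the stack for one SIMD event page and being reinterpreted as a flat fptype* (the new w_sv / wfs lines just below), while on the GPU side kernel splitting forces them into the global-memory allWfs buffer. A minimal self-contained sketch of the C++ pattern, with stand-in definitions for the plugin's fptype, neppV and cxtype_sv (the reinterpret_cast target type <fptype*> is assumed, as the template argument is not visible in this extract):

```cpp
using fptype = double;
constexpr int neppV = 4;                               // events per SIMD vector (a build-time choice)
struct cxtype_v { fptype r[neppV]; fptype i[neppV]; }; // stand-in for the plugin's complex SIMD type

int main()
{
  constexpr int nwf = 12, nw6 = 6;                  // #wavefunctions and components per wavefunction
  cxtype_v w_sv[nwf][nw6] = {};                     // local wavefunctions for one SIMD event page
  fptype* wfs = reinterpret_cast<fptype*>( w_sv );  // flat view passed to the diagram kernels
  // The flat buffer holds nwf*nw6*2*neppV fptypes, matching wavefunctions[nwf*2*nw6*nevtORneppV]
  static_assert( sizeof( w_sv ) == nwf * nw6 * 2 * neppV * sizeof( fptype ), "unexpected layout" );
  return wfs != nullptr ? 0 : 1;
}
```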
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxg()?) 
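Before the removed lines below, it is worth spelling out the quantity that both the old inline code and the new color_sum_gpu kernel / BLAS path evaluate. With J the vector of ncolor color flows (jamps) for one event and one helicity, cf the real symmetric color matrix, and denom its normalization (all entries equal to 9 for this process), the per-helicity contribution to |M|^2 is the quadratic form

\[
\Delta |M|^2 = \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*} \, \frac{cf_{ij}}{\mathrm{denom}_i} \, J_j
= \sum_{i} \frac{cf_{ii}}{\mathrm{denom}_i} \, \big| J_i \big|^2
+ \sum_{i<j} \frac{2 \, cf_{ij}}{\mathrm{denom}_i} \, \big( \Re J_i \, \Re J_j + \Im J_i \, \Im J_j \big) .
\]

Writing J = A + iB, the first equality uses the property that cf is real and symmetric, so the imaginary cross terms cancel and J^\dagger cf J = A^T cf A + B^T cf B (see #475); the second folds the off-diagonal terms into twice the upper triangle, which is exactly what the constexpr TriangularNormalizedColorMatrix further down precomputes once and for all at compile time.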
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: 
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+    // In multichannel mode, also compute the running sums over helicities of numerators and denominators
    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
    {
      const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
#else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
#endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
    }
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index b583fc85fe..a3fb48fbb8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
    //static const int namplitudes = 18;
    //static const int ncomb = 32; // CPPProcess::ncomb
@@ -122,23 +123,26 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,      // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators,    // output: denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel );          // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps,           // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - device array (GPU device implementation)
+                       const int nevt );           // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
-  __global__ void
+  void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,      // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators,    // output: denominators[nevt], running_sum_over_helicities
#endif
                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array (C++ implementation)
                       const int nevt );           // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -152,34 +156,46 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
  sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
            const fptype* allcouplings,       // input: couplings[nevt*ndcoup*2]
            const fptype* allrndhel,          // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
            const fptype* allrndcol,          // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
            fptype* allMEs,                   // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel,                   // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,            // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,          // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol,                   // output: color selection[nevt]
+            fptype* colAllJamp2s,             // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,        // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index 8843b88a23..81d6a09df0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index b22dde0f92..8668bbcb4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
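The new color_sum.cc below gathers three interchangeable implementations of one reduction: color_sum_cpu (SIMD C++), color_sum_kernel (a plain GPU kernel) and color_sum_blas (cuBLAS/hipBLAS GEMMs). All of them add to the running ME the quadratic form J† (M/d) J, where J is the vector of colour amplitudes (jamps) and M/d is the real symmetric colour matrix divided by its denominators; because M is real, the form reduces to Re(J)·N·Re(J) + Im(J)·N·Im(J) with N the normalised matrix. A minimal scalar sketch of that reduction (illustrative names, not the plugin API):

```cpp
// Sketch only: the colour sum computed by all three implementations in this file.
#include <complex>
#include <vector>
double colorSumSketch( const std::vector<std::complex<double>>& jamp,
                       const std::vector<std::vector<double>>& normColorMatrix ) // M[icol][jcol]/denom
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += normColorMatrix[icol][jcol] * jamp[jcol].real();
      ztempI += normColorMatrix[icol][jcol] * jamp[jcol].imag();
    }
    // M is real, so (A-iB)M(A+iB) collapses to A.M.A + B.M.B (see the #475 comment below)
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2; // callers ADD this to the running sum of |M|^2 over helicities
}
```

The BLAS variant maps the same two steps onto one GEMM per component (Ztemp = N·J, separately for real and imaginary parts) followed by batched 1x1 GEMMs that form the per-event dot products.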
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
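The new diagrams.h below splits the former monolithic calculate_wavefunctions into one kernel per Feynman diagram, all sharing a uniform signature so they can be launched back-to-back on a helicity stream. As a hedged sketch of how a caller would chain them (hypothetical wrapper name; the generated launcher lives in calculate_jamps, which this diff does not show in full):

```cpp
// Sketch only: chaining the per-diagram kernels for one helicity on its stream.
// diagram1 also computes the external wavefunctions, so it alone takes momenta and ihel;
// the later diagrams reuse the wavefunctions cached in the wfs buffer (w_fp[...]).
void launchDiagramsSketch( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                           const fptype* couplings, fptype* numerators, fptype* denominators,
                           const fptype* momenta, const int ihel,
                           gpuStream_t stream, const int gpublocks, const int gputhreads )
{
  gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, stream,
                         wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
  gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, stream,
                         wfs, jamps, channelIds, couplings, numerators, denominators );
  // ... and so on for diagram3..diagram16
}
```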
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 16 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
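Each kernel evaluates a single amplitude and scatters it into the six colour-flow accumulators with fixed colour coefficients (plus or minus 1 or i), as the J_ACCESS lines above and below show; in multichannel mode it also adds |amp|^2 to the numerator when channelId matches the diagram, and to the denominator whenever channelId is non-zero. A standalone illustration of the scatter pattern, with coefficients copied from diagrams 1-3 (hypothetical helper, not the plugin API):

```cpp
// Sketch only: per-diagram colour-flow accumulation for this ncolor=6 process.
#include <complex>
using cplx = std::complex<double>;
void scatterAmpSketch( cplx jamp[6], const cplx& amp, int idiag )
{
  const cplx I( 0, 1 );
  switch( idiag )
  {
    case 1: jamp[0] -= amp; jamp[2] += amp; jamp[4] += amp; jamp[5] -= amp; break; // VVV1_0 amplitude
    case 2: jamp[4] += I * amp; jamp[5] -= I * amp; break;                         // FFV1_0 amplitude
    case 3: jamp[0] += I * amp; jamp[2] -= I * amp; break;                         // FFV1_0 amplitude
    // ... one case per diagram, generated from the colour algebra of the process
  }
}
```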
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 16 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 16 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 16 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 16 ***
+    // Wavefunction(s) for diagram number 5
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 16 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 7 OF 16 ***
+ // Wavefunction(s) for diagram number 7
+ FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] );
+ // Amplitude(s) for diagram number 7
+ FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 8 OF 16 ***
+ // Wavefunction(s) for diagram number 8
+ // (none)
+ // Amplitude(s) for diagram number 8
+ FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
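The wfs argument declared as wavefunctions[nwf*2*nw6*nevtORneppV] above is sliced in diagram_boilerplate.h into nwf per-wavefunction blocks, w_fp[iwf] = wfs + iwf * nevt * nw6 * nx2, each holding nw6 complex components per event. A hedged index helper for the GPU-side layout (names illustrative, nx2 = 2 for the real and imaginary parts):

#include <cstddef>
// Flat index into wfs for wavefunction iwf, event ievt, component iw6 and
// real/imaginary part reim, under the striding described above.
inline std::size_t wfsIndex( std::size_t iwf, std::size_t ievt, std::size_t iw6,
                             std::size_t reim, std::size_t nevt, std::size_t nw6 )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts
  return ( ( iwf * nevt + ievt ) * nw6 + iw6 ) * nx2 + reim;
}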
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 9 OF 16 ***
+ // Wavefunction(s) for diagram number 9
+ // (none)
+ // Amplitude(s) for diagram number 9
+ FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 10 OF 16 ***
+ // Wavefunction(s) for diagram number 10
+ VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ // Amplitude(s) for diagram number 10
+ FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
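cxabs2, used in every multichannel block above, is the squared modulus computed without the square root (and without the overflow protections) of std::abs. A scalar sketch; the plugin's actual version presumably also covers the vectorized cxtype_sv:

#include <complex>
// |c|^2 = re^2 + im^2, avoiding the sqrt of std::abs(c)
inline double cxabs2( const std::complex<double>& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}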
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 11 OF 16 ***
+ // Wavefunction(s) for diagram number 11
+ // (none)
+ // Amplitude(s) for diagram number 11
+ FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 12 OF 16 ***
+ // Wavefunction(s) for diagram number 12
+ // (none)
+ // Amplitude(s) for diagram number 12
+ VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ }
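On the C++ backend these kernels process one SIMD event page at a time: the _sv suffix marks vector types that hold neppV events in lockstep, built on the GCC/Clang vector_size extension (as in the plugin's mgOnGpuVectors.h, included via color_sum.h below). A minimal sketch with an illustrative page size:

// One fptype_v holds neppV doubles processed in lockstep (illustrative sizes).
typedef double fptype;
constexpr int neppV = 4; // e.g. four doubles in a 256-bit AVX2 register
typedef fptype fptype_v __attribute__( ( vector_size( 32 ) ) ); // 32 = neppV * sizeof( fptype )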
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 13 OF 16 ***
+ // Wavefunction(s) for diagram number 13
+ // (none)
+ // Amplitude(s) for diagram number 13
+ FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 14 OF 16 ***
+ // Wavefunction(s) for diagram number 14
+ // (none)
+ // Amplitude(s) for diagram number 14
+ FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 15 OF 16 ***
+ // Wavefunction(s) for diagram number 15
+ // (none)
+ // Amplitude(s) for diagram number 15
+ VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ }
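diagram16 below is the only kernel without a multichannel numerator/denominator block, presumably because the four-gluon contact vertex carries no propagator that could define an SDE channel. Its three Lorentz structures (VVVV1, VVVV3, VVVV4) each produce an internal wavefunction and an amplitude whose color-flow coefficients, transcribed from the generated code that follows, are all 0 or +-1:

// Coefficients of the three VVVV Lorentz structures on the six color flows
// (rows: VVVV1, VVVV3, VVVV4; columns: jamps[0..5]), read off diagram16 below.
const int vvvvCoeff[3][6] = { { +1, -1, 0, -1, 0, +1 },
                              { 0, -1, +1, -1, +1, 0 },
                              { -1, 0, +1, 0, +1, -1 } };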
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 16 OF 16 ***
+ // Wavefunction(s) for diagram number 16
+ VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] );
+ VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] );
+ VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] );
+ // Amplitude(s) for diagram number 16
+ FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
index c2eadb2c31..10332b6238 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
 fopened=.false.
 tempname=filename
 fine=index(tempname,' ')
-c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
 if(fine.eq.0) fine=len(tempname)
 open(unit=lun,file=tempname,status='old',ERR=20)
 fopened=.true.
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
index 7d44ae130e..76a34107bc 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
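The hunk above replaces the REAL*8 NCOLOR x NCOLOR color matrix by its integer upper triangle CF plus a common denominator DENOM=9; comparing with the removed DATA statements shows that the off-diagonal entries are stored doubled (e.g. -8.888...D-01 = -8/9 becomes -16), so a single sweep over J >= I reproduces the full symmetric bilinear once the real part is taken, as the rewritten loop in the next hunk does before dividing by DENOM. A C++ sketch of the same packed sum (names illustrative):

#include <complex>
// |M|^2 from color flows with a packed upper-triangular integer color matrix
// cf (off-diagonal entries pre-doubled) and a common denominator, as in MATRIX1.
double colorSumPacked( const std::complex<double>* jamp, const int* cf,
                       int ncolor, int denom )
{
  double me2 = 0.;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // the Fortran REAL*8 assignment takes the real part implicitly
  }
  return me2 / denom;
}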
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+
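With the "new1" striding above (two separate ncolor x nevt real matrices for the real and imaginary parts of jamp), the color sum over all events reduces to real matrix products, which is what lets color_sum_gpu (declared next) offload the work to cuBLAS/hipBLAS: since the color matrix CF is real and symmetric, ME[e] = sum_ij jamp_i* CF_ij jamp_j = sum_ij CF_ij ( R_i R_j + I_i I_j ) for each event column e. A hedged CPU sketch of this GEMM formulation, with plain loops standing in for the BLAS calls and a dense CF presumably pre-normalized by createNormalizedColorMatrix above:

// T = CF*R and U = CF*I would each be one GEMM; plain loops illustrate the
// math here. cf is ncolor x ncolor row-major; jampR/jampI are ncolor x nevt.
void colorSumGemmSketch( const double* cf, const double* jampR, const double* jampI,
                         double* me2, int ncolor, int nevt )
{
  for( int e = 0; e < nevt; e++ ) me2[e] = 0.;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      for( int e = 0; e < nevt; e++ )
        me2[e] += cf[i * ncolor + j] * ( jampR[i * nevt + e] * jampR[j * nevt + e]
                                       + jampI[i * nevt + e] * jampI[j * nevt + e] );
}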
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
# Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
#$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
#$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
#-------------------------------------------------------------------------------
#=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
###$(info processid_short=$(processid_short))
MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
ifneq ($(GPUCC),)
MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
- $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+ $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
# Bypass std::filesystem completely to ease portability on LUMI #803
#ifneq ($(findstring hipcc,$(GPUCC)),)
# $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 double precision smin,smax,spole,swidth,s,jac
 double precision x
 logical pass
+ include 'maxparticles.inc'
+ include '../../Source/vector.inc'
+ include 'run.inc'
+ include 'cuts.inc'
c
c Local
c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
c-----
c Begin Code
c-----
+ if (dsqrt_shatmax.ne.-1d0)then
+ smax = min(smax, dsqrt_shatmax**2)
+ endif
+
 pass=.true.
 if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case: HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) - output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
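A note on the 'flavour_bias' run_card option validated above: the pair [abs(PDG), factor] multiplies the generation probability of events containing that flavour while dividing their weight by the same factor, so the weighted cross section is unchanged; this bookkeeping only works when event_norm is 'bias'. Below is a minimal standalone sketch of that consistency rule; the dict-based card and the exception class are stand-ins for the real madgraph objects, not the actual API.

```python
# Minimal sketch of the 'flavour_bias' consistency check, assuming a
# dict-like run card; InvalidRunCard here is a local stand-in.

class InvalidRunCard(Exception):
    pass

def check_flavour_bias(run_card):
    """Validate run_card['flavour_bias'] = [abs_pdg, factor] and force
    the 'bias' event normalisation when the factor is non-trivial."""
    bias = run_card['flavour_bias']
    if len(bias) != 2:
        raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: "
                             "the abs(PDG) of the flavour to enhance, and the "
                             "enhancement multiplication factor.")
    if any(i < 0 for i in bias):
        raise InvalidRunCard("flavour and multiplication factor should be positive")
    if bias[1] != 1 and run_card['event_norm'] != 'bias':
        # enhanced events carry reduced weights, so the sum of weights only
        # reproduces the cross section with 'bias' normalisation
        run_card['event_norm'] = 'bias'
    return run_card

card = {'flavour_bias': [5, 100], 'event_norm': 'average'}
assert check_flavour_bias(card)['event_norm'] == 'bias'
```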
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def 
cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
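The gridpack refinement above now splits a channel into several jobs of at most 'maxevts' events each and names the split directories with a letter-plus-counter suffix before submitting them to a MultiCore pool. A small standalone sketch of that bookkeeping follows; `alphabet` is assumed to be the lowercase alphabet as in gen_ximprove, and the names are illustrative.

```python
import string

alphabet = string.ascii_lowercase  # assumption: matches gen_ximprove's alphabet

def split_jobs(needed_event, max_request_event, max_splitting=100):
    """Ceil-divide the requested events into jobs, as in get_job_for_event above."""
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    return min(nb_split, max_splitting)

def split_dir_names(directory, nb_split):
    """Suffix each split with alphabet[i % 26] + str((i + 1) // 26), as above."""
    return [directory + alphabet[i % 26] + str((i + 1) // 26)
            for i in range(nb_split)]

# e.g. 9000 requested events with at most 2500 per job -> 4 split jobs
assert split_jobs(9000, 2500) == 4
print(split_dir_names('G100', 4))  # ['G100a0', 'G100b0', 'G100c0', 'G100d0']
```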
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
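The pseudorapidity fix above flips a sign: the correct definition is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which coincides with -ln tan(theta/2), while the old ordering returned -eta. A quick standalone check of the corrected formula (plain Python, independent of lhe_parser):

```python
import math

def pseudorapidity(px, py, pz):
    """eta = 0.5*log((|p|+pz)/(|p|-pz)), as in the corrected property above."""
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

def eta_from_theta(px, py, pz):
    """Equivalent polar-angle form: eta = -log(tan(theta/2))."""
    theta = math.atan2(math.hypot(px, py), pz)
    return -math.log(math.tan(theta / 2))

p = (1.0, 2.0, 3.0)
assert abs(pseudorapidity(*p) - eta_from_theta(*p)) < 1e-9
# a particle going forward (pz > 0) must have positive eta, which the
# old (norm - pz)/(norm + pz) ordering violated
assert pseudorapidity(*p) > 0
```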
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
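The partial-combination logic above chunks the list of G directories with the split(a, n) helper visible in the hunk, which distributes len(a) items into n contiguous, nearly equal slices. A standalone illustration of how it balances the chunks:

```python
def split(a, n):
    """Yield n contiguous chunks of a whose lengths differ by at most one."""
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

gdirs = ['G%d' % i for i in range(10)]
chunks = [list(c) for c in split(gdirs, 3)]
# 10 = 4 + 3 + 3: the first len(a) % n chunks get one extra element
assert [len(c) for c in chunks] == [4, 3, 3]
```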
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card.
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
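Each Pythia8 split job above gets its own card: its Main:numberOfEvents matches its share of the events, and its HEPMCoutput:scaling is multiplied by that share so that the recombined HepMC weights stay normalised. A schematic of that renormalisation; the partition sizes and the base scaling below are illustrative values, not read from a real card:

```python
def per_split_scaling(base_scaling, partition):
    """Return the (numberOfEvents, HEPMCoutput:scaling) pair per split,
    mirroring the systemSet calls above: scaling is multiplied by the
    number of events handed to that split."""
    return [(n_i, base_scaling * float(n_i)) for n_i in partition]

partition_for_PY8 = [2500, 2500, 2000]   # events handed to each split job
base = 1.0 / sum(partition_for_PY8)      # assumption: original per-event scaling
for i, (nev, scale) in enumerate(per_split_scaling(base, partition_for_PY8)):
    print('PY8Card_%d.dat: Main:numberOfEvents=%d  HEPMCoutput:scaling=%g'
          % (i, nev, scale))
```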
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no possible BW configuration found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.'
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! self.gscalefact = x_improve.gscalefact #store jacobian associated to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/madevent b/epochX/cudacpp/gg_tt01g.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt01g.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index ff9f0d7f00..cb66251689 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -978,7 +978,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -991,7 +991,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1014,7 +1014,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1027,7 +1027,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1065,7 +1065,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1077,7 +1077,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1090,7 +1090,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1109,7 +1109,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1122,7 +1122,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1141,7 +1141,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1154,7 +1154,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1172,7 +1172,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1187,7 +1187,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1207,7 +1207,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1222,7 +1222,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1242,7 +1242,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1257,7 +1257,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
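The repeated C_ACCESS to CD_ACCESS renames in the HelAmps hunks above distinguish the accessor for dependent couplings from the generic coupling accessor. A simplified, self-contained sketch of this access-class template pattern follows; it is not the plugin's real machinery (the actual buffers are SIMD vectors of complex couplings), only an illustration of why the helas-like kernels are templated on an accessor type:

```cpp
// Simplified stand-in for the ACCESS-class pattern: the kernel body is
// templated on an accessor type so the same algebra works for any layout.
#include <iostream>

struct ScalarCouplingAccess // hypothetical accessor: one coupling per buffer
{
  static const double& kernelAccessConst( const double* buf ) { return buf[0]; }
};

template<class CD_ACCESS> // 'CD' = dependent-coupling accessor, as in the rename above
double vertexSketch( const double* allCOUP )
{
  const double& COUP = CD_ACCESS::kernelAccessConst( allCOUP ); // mirrors kernelAccessConst( allCOUP )
  return 2. * COUP; // placeholder for the real amplitude algebra
}

int main()
{
  const double coup[1] = { 1.218 }; // e.g. a QCD coupling value
  std::cout << vertexSketch<ScalarCouplingAccess>( coup ) << std::endl;
  return 0;
}
```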
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! 
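The new MGONGPU_HAS_NO_BLAS switch above defaults to BLAS-enabled for CUDA and HIP builds and BLAS-disabled for C++-only builds, while remaining overridable from the compile line. A stripped-down sketch of the same three-way guard (compilable as plain C++, in which case it always takes the no-BLAS branch; the printout is mine, not the plugin's):

```cpp
// Minimal sketch of the MGONGPU_HAS_NO_BLAS selection logic shown above.
// Compile with -DMGONGPU_HAS_NO_BLAS to force the no-BLAS path everywhere.
#include <iostream>

#ifdef __CUDACC__ // CUDA build: cuBLAS assumed available unless overridden
// (default: MGONGPU_HAS_NO_BLAS left undefined)
#elif defined __HIPCC__ // HIP build: hipBLAS assumed available unless overridden
// (default: MGONGPU_HAS_NO_BLAS left undefined)
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++-only build: no GPU BLAS
#endif

int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  std::cout << "noBLAS build: GPU BLAS calls compiled out" << std::endl;
#else
  std::cout << "BLAS build: cuBLAS/hipBLAS entry points enabled" << std::endl;
#endif
  return 0;
}
```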
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index c216de0edd..ed4804611b 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006003379821777344  +DEBUG: model prefixing takes 0.005472660064697266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
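The operator+= and operator-= added to cxtype_ref above let split real/imaginary storage be updated in place through the same reference-proxy object used for plain assignment. The following is a self-contained miniature of that proxy pattern, using plain doubles instead of the plugin's fptype/SIMD types:

```cpp
// Miniature of the cxtype_ref proxy: a complex 'reference' over two separate
// real and imaginary slots, now supporting in-place += / -= as added above.
#include <complex>
#include <iostream>

using cxtype = std::complex<double>;

class cxtype_ref
{
public:
  cxtype_ref( double& r, double& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxtype_ref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxtype_ref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  double* const m_preal; // const pointer to non-const real part
  double* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  double re[2] = { 1., 2. }, im[2] = { 0., 0. }; // split (SOA-like) storage
  cxtype_ref j0( re[0], im[0] );
  j0 += cxtype( 0.5, -1. ); // accumulate in place, no temporary lvalue needed
  std::cout << cxtype( j0 ) << std::endl; // prints (1.5,-1)
  return 0;
}
```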
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,9 +177,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s Wrote files for 36 helas calls in 0.123 s ALOHA: aloha starts to compute helicity amplitudes @@ -188,14 +188,14 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with 
options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.316 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.298 s VVV1 VVV1 FFV1 @@ -205,37 +205,37 @@ ALOHA: aloha creates 10 routines in 0.310 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.439s -user 0m2.135s -sys 0m0.297s +real 0m2.462s +user 0m2.188s +sys 0m0.274s Code generation completed in 2 seconds ************************************************************ * * @@ -249,7 +249,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -257,9 +257,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -279,7 +279,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,9 +287,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3ace6e558c..36bb202386 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index d087670827..a16ea5dee6 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 43e93cbf40..cdcd77f36d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttg.mad/Source/makefile b/epochX/cudacpp/gg_ttg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
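The Bridge changes above silence the construction-time banners unless a verbosity macro is set: the couts and the abnormal-ME flagging survive only when MGONGPUCPP_VERBOSE is defined at build time. A tiny sketch of the same compile-time gating (macro name as in the diff; the free function and event count are hypothetical):

```cpp
// Sketch of the MGONGPUCPP_VERBOSE gating used in Bridge.h above.
// Build with -DMGONGPUCPP_VERBOSE to re-enable the informational banner.
#include <iostream>

void instantiateBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... allocate buffers and matrix-element kernels as usual ...
  (void)nevt; // keep -Wunused-parameter quiet in non-verbose builds
}

int main()
{
  instantiateBridge( 16384 );
  return 0;
}
```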
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
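The gpuBlasT* aliases at the end of GpuAbstraction.h above select the single- or double-precision BLAS entry points from MGONGPU_FPTYPE2_FLOAT, layered on top of the CUDA/HIP name mapping. Below is a host-only sketch of that two-level macro dispatch; the stub gemm-like functions are assumptions of mine standing in for cublas/hipblas calls, and only the selection pattern matches the diff:

```cpp
// Host-only illustration of the gpuBlasTgemm-style dispatch above:
// the 'T' alias resolves to an S (float) or D (double) implementation
// depending on MGONGPU_FPTYPE2_FLOAT, without touching call sites.
#include <cstdio>

void blasSgemmStub( int n, const float* x, float* y ) // stand-in for cublasSgemm/hipblasSgemm
{
  for( int i = 0; i < n; i++ ) y[i] += x[i];
  std::printf( "ran float stub\n" );
}

void blasDgemmStub( int n, const double* x, double* y ) // stand-in for cublasDgemm/hipblasDgemm
{
  for( int i = 0; i < n; i++ ) y[i] += x[i];
  std::printf( "ran double stub\n" );
}

#ifdef MGONGPU_FPTYPE2_FLOAT
#define blasTgemmStub blasSgemmStub
typedef float fptype2;
#else
#define blasTgemmStub blasDgemmStub
typedef double fptype2;
#endif

int main()
{
  fptype2 x[4] = { 1, 2, 3, 4 }, y[4] = { 0, 0, 0, 0 };
  blasTgemmStub( 4, x, y ); // the call site is precision-agnostic
  return 0;
}
```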
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
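The assertGpuBlas helper above extends the existing checkGpu/assertGpu pattern to BLAS return codes: any cuBLAS/hipBLAS call that returns a gpuBlasStatus_t can be wrapped in checkGpuBlas. A minimal sketch of the handle-plus-stream setup that MatrixElementKernelDevice performs later in this patch (the helper name createHandleOnStream is illustrative, not part of the patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpuBlas
#ifndef MGONGPU_HAS_NO_BLAS
gpuBlasHandle_t createHandleOnStream( gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // asserts (and aborts) on any non-success status
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // all BLAS work on this handle is queued on 'stream'
  return handle;
}
#endif

Binding one handle to one stream is what later allows the per-helicity BLAS color sums to be queued concurrently.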
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a host array" ); @@ -191,14 +193,14 @@ MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelHost::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ sstr << "MatrixElementKernelDevice: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices?
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
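After the deletions above, KernelAccessAmplitudes keeps only the trivial branch: amplitude "access" is a plain reinterpret_cast of a small per-diagram local buffer, with no event indexing at all. A minimal host-side sketch of what this reduces to (the function name is illustrative; this assumes the usual mg5amcCpu namespace and that the template parameter is the onDevice flag):

#include "MemoryAccessAmplitudes.h"
#include "mgOnGpuVectors.h" // for cxtype_sv and cxzero_sv
using namespace mg5amcCpu;
void exampleAmplitudeAccess()
{
  cxtype_sv amp_sv[1]; // one invariant amplitude, as used per diagram in CPPProcess.cc
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype* view passed to the HELAS-like calls
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( amp_fp ); // points straight back at amp_sv
  amp[0] = cxzero_sv(); // reset the amplitude in place
}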
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
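The new DeviceAccessWavefunctions shown above replaces the trivial device-side type-pun: with kernel splitting, wavefunctions live in one large global-memory buffer and each CUDA thread must first offset to its own per-event slice of nw6 complex components. A minimal sketch of the same indexing (the kernel name is illustrative; this assumes the usual mg5amcGpu namespace):

#include "MemoryAccessWavefunctions.h" // brings in CPPProcess.h and the cxtype_sv types
#include "mgOnGpuVectors.h"
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
__global__ void exampleWfAccess( fptype* buffer )
{
  // Same arithmetic as DeviceAccessWavefunctions::kernelAccess( buffer ):
  // event ievt starts nw6*nx2 fptypes (nw6 complex numbers) further into the buffer
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  cxtype_sv* wfs = reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
  wfs[0] = cxmake( 0., 0. ); // first of the nw6 components owned by this event
}
#endif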
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..1db10f1e09 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; -
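
For illustration only (an editorial sketch with dummy amplitudes and std::complex standing in for cxtype; it is not the generated kernel code): diagram 1 above, and every removed diagram block that follows, repeats the same multichannel bookkeeping. The numerator accumulates |amp|^2 only for the diagram matching the selected channelId, while the denominator accumulates |amp|^2 for every diagram whenever multichannel is enabled; their ratio later rescales |M|^2 for single-diagram enhancement.

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ndiagrams = 16;
  std::complex<double> amp[ndiagrams]; // one amplitude per diagram (dummy values here)
  for( int idiag = 0; idiag < ndiagrams; idiag++ ) amp[idiag] = { 0.1 * ( idiag + 1 ), -0.05 };
  const unsigned int channelId = 2; // 1 to ndiagrams selects a diagram, 0 disables SDE
  double numerator = 0, denominator = 0;
  for( int idiag = 0; idiag < ndiagrams; idiag++ )
  {
    const double abs2 = std::norm( amp[idiag] ); // |amp|^2, like cxabs2 above
    if( channelId == (unsigned int)( idiag + 1 ) ) numerator += abs2; // selected diagram only
    if( channelId != 0 ) denominator += abs2;                         // all diagrams
  }
  printf( "SDE weight num/den = %f\n", numerator / denominator );
  return 0;
}
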
- // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
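
For illustration only (an editorial, scalar, non-SIMD sketch with dummy jamp values; the real code is vectorized and, in the new scheme, lives in the color_sum_gpu/color_sum_cpu kernels): the color algebra in the removed lines below evaluates the quadratic form |M|^2 = sum_{i,j} jamp_i^* (cf[i][j] / denom[i]) jamp_j, and since the color matrix cf is real only the Re*Re and Im*Im terms survive (#475).

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 6;
  const double denom[ncolor] = { 9, 9, 9, 9, 9, 9 };
  const double cf[ncolor][ncolor] = { { 64, -8, -8, 1, 1, 10 }, { -8, 64, 1, 10, -8, 1 }, { -8, 1, 64, -8, 10, 1 }, { 1, 10, -8, 64, 1, -8 }, { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } };
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.2, 0.1 }, { 0.3, -0.4 }, { 0.0, 0.2 }, { -0.1, -0.1 }, { 0.5, 0.0 } }; // dummy values
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // real part of M * jamp
      ztempI += cf[icol][jcol] * jamp[jcol].imag(); // imag part of M * jamp
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "deltaME = %f\n", deltaME );
  return 0;
}
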
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: 
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..163076da52 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..bc9333bb5d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
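[Editor's note] For readers not fluent in GEMM conventions, the two-step cuBLAS color sum in color_sum_blas above computes, per event, the same quadratic form as the kernels. A scalar reference of what the two Tgemm calls evaluate under the "new1" layout is sketched below (colorSumReference is a hypothetical helper, not part of the plugin); step 1 is the dense matrix product, step 2 the batched per-event dot product.

#include <vector>
// normCM: [ncolor*ncolor] normalized color matrix, row-major
// jamps: [2*ncolor*nevt] in "new1" layout (Re block then Im block, ievt fastest)
// mes: [nevt], incremented (beta=1 in the second GEMM)
void colorSumReference( int nevt, int ncolor, const double* normCM, const double* jamps, double* mes )
{
  std::vector<double> ztemp( ncolor );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int part = 0; part < 2; part++ ) // 0 = real block, 1 = imaginary block
    {
      // Step 1: Ztemp = normCM * J (one GEMM each for Re and Im in the real code)
      for( int i = 0; i < ncolor; i++ )
      {
        ztemp[i] = 0;
        for( int j = 0; j < ncolor; j++ )
          ztemp[i] += normCM[i * ncolor + j] * jamps[part * ncolor * nevt + j * nevt + ievt];
      }
      // Step 2: ME += J dot Ztemp (a strided batch of 1x1 GEMMs in the real code)
      for( int i = 0; i < ncolor; i++ )
        mes[ievt] += jamps[part * ncolor * nevt + i * nevt + ievt] * ztemp[i];
    }
}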
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for 
diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 16 *** + // Wavefunction(s) for diagram number 16 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..850d121618 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
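For orientation: this new header factors the color sum out of the matrix-element kernels into its own compilation unit (cudacpp.mk below adds color_sum_cpp.o and color_sum_$(GPUSUFFIX).o, plus an optional cuBLAS/hipBLAS link via HASBLAS). The quantity its kernels accumulate per event and per helicity is, schematically (LaTeX notation),

  |M|^2 = \frac{1}{D} \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*} \, CF_{ij} \, J_j

with J_i the color-ordered amplitudes ("jamps"), CF the symmetric color matrix and D its common denominator; this is the same sum evaluated by the packed CF_INDEX loop in the Fortran MATRIX1 above (D = DENOM = 9 for this process).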
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
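The "new1" striding documented in the comments above stores all jamps for one helicity as two back-to-back ncolor*nevt blocks, all real parts first and then all imaginary parts, with the event index running fastest; each block is then a contiguous matrix that cuBLAS/hipBLAS can consume directly, while CUDA kernels reach the same elements through kernelAccessIcol. A standalone sketch of the index arithmetic under that layout (plain C++, no GPU intrinsics; the helper names are hypothetical):

// "new1" layout of buffer[2 * ncolor * nevt]:
//   [ Re(icol=0, all events) ... Re(icol=ncolor-1, all events) |
//     Im(icol=0, all events) ... Im(icol=ncolor-1, all events) ]
inline double& jampRe( double* buffer, int icol, int ievt, int ncolor, int nevt )
{
  return buffer[0 * ncolor * nevt + icol * nevt + ievt]; // real block
}
inline double& jampIm( double* buffer, int icol, int ievt, int ncolor, int nevt )
{
  return buffer[1 * ncolor * nevt + icol * nevt + ievt]; // imaginary block
}

Because each block is a dense ncolor-by-nevt matrix, one GEMM on the real block and one on the imaginary block can evaluate CF*J for all events of a grid at once, which is what the color_sum_gpu declaration below (with its blasTmp buffer and BLAS handle) exploits.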
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name]) + + def defaultSet(self, name, value, **opts): @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of name convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' is a flag indicating whether the MG5aMC-PY8 interface is used; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case: HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2,
log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR>
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
<BR><BR>
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
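The comment above spells out how the central merging scale is chosen from the parsed weight headers; the sketch below restates that choice as a standalone function (pick_merging_scale and the dict layout are illustrative assumptions, not the actual HwU internals).

# A minimal sketch of the merging-scale selection described above: each entry
# of all_weights is assumed to be a dict carrying a 'MERGING' key, and a
# negative or unspecified merging cut falls back to the central entry.
def pick_merging_scale(all_weights, merging_scale=None):
    if not all_weights:
        raise ValueError('No weights were found in the HwU XML source.')
    if merging_scale is None or merging_scale < 0.0:
        return all_weights[2]['MERGING']  # central weight, as in the code below
    return merging_scale

print(pick_merging_scale([{'MERGING': 0.}, {'MERGING': 0.}, {'MERGING': 30.}]))  # 30.0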
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
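The accumulation above merges the per-run results with inverse-variance weighting; a self-contained sketch of that formula follows (combine_cross_sections and the sample numbers are illustrative only).

import math

# Standalone restatement of the combination used by do_multi_run above: an
# inverse-variance weighted mean of the per-run cross sections,
# cross = sum(x_i/e_i^2) / sum(1/e_i^2), error = sqrt(1/sum(1/e_i^2)).
def combine_cross_sections(runs):
    crossoversig = sum(cross / error**2 for cross, error in runs)
    inv_sq_err = sum(1.0 / error**2 for _, error in runs)
    return crossoversig / inv_sq_err, math.sqrt(1.0 / inv_sq_err)

cross, error = combine_cross_sections([(10.2, 0.3), (9.8, 0.4)])  # toy numbers (pb)
print('%.3f +- %.3f' % (cross, error))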
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
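With the new use_mg5amc_py8_interface flag this veto handling reduces to a single gate, sketched here (gate_matching_veto and the dict-based card are illustrative assumptions; the real code goes through PY8_Card.MadGraphSet as below).

# Illustration of the gating added in this patch: when the external
# MG5aMC_PY8_interface drives the shower it applies the matching veto itself,
# so PY8's own veto is switched off; with the plain main164 driver the card
# is left untouched and PY8 performs the veto.
def gate_matching_veto(card, use_mg5amc_py8_interface, use_syst):
    if use_mg5amc_py8_interface and use_syst:
        card['JetMatching:doVeto'] = False
        card['Merging:applyVeto'] = False
    return card

print(gate_matching_veto({}, True, True))   # both vetoes disabled in the card
print(gate_matching_veto({}, False, True))  # {} (PY8 keeps its own defaults)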
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
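Just above, Main:numberOfEvents is pinned only for single-core running; the decision is sketched here as a standalone predicate (runs_single_core is an illustrative name; run_mode 0 is single core and 2 is multicore, as in the surrounding options).

# Single-core PY8 runs shower the whole LHE file in one pass, so the event
# count can be fixed up front; parallel runs split the file and set it per job.
def runs_single_core(run_mode, nb_core):
    return run_mode == 0 or (run_mode == 2 and nb_core == 1)

assert runs_single_core(0, 8)
assert runs_single_core(2, 1)
assert not runs_single_core(2, 4)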
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
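The rescaling above keeps the HepMC weight normalization consistent across splits; a small worked sketch follows (make_split_card and the numbers are illustrative, the real code sets the values through PY8Card.systemSet).

# If the parent card scales each weight by 1/N_total, a split that showers
# n_i events must multiply that factor by n_i so that the splits still add
# up to the full-sample normalization.
def make_split_card(parent_card, n_events_split):
    card = dict(parent_card)
    card['Main:numberOfEvents'] = n_events_split
    card['HEPMCoutput:scaling'] = parent_card['HEPMCoutput:scaling'] * float(n_events_split)
    return card

parent = {'Main:numberOfEvents': 0, 'HEPMCoutput:scaling': 1.0 / 10000}
print(make_split_card(parent, 2500))  # scaling becomes 0.25 for this split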
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
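The gridpack refine introduced above splits each channel into jobs of at most maxevts events; the job count is a plain ceiling division, sketched here (n_jobs and the max_splitting default are illustrative names and values).

# Ceiling division matching nb_split in get_job_for_event: enough jobs that
# none generates more than max_request_event events, floored at one job and
# capped by an overall splitting limit.
def n_jobs(needed_event, max_request_event, max_splitting=1000):
    nb_split = max(1, (int(needed_event) - 1) // int(max_request_event) + 1)
    return min(nb_split, max_splitting)

print(n_jobs(2000, 2500))   # 1
print(n_jobs(10000, 2500))  # 4
print(n_jobs(10001, 2500))  # 5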
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttg.mad/bin/madevent b/epochX/cudacpp/gg_ttg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index ff9f0d7f00..cb66251689 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -978,7 +978,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -991,7 +991,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1014,7 +1014,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1027,7 +1027,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1065,7 +1065,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1077,7 +1077,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1090,7 +1090,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1109,7 +1109,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1122,7 +1122,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1141,7 +1141,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1154,7 +1154,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1172,7 +1172,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1187,7 +1187,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1207,7 +1207,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1222,7 +1222,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1242,7 +1242,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1257,7 +1257,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
__device__ inline void
G2COUP( const fptype gs[],
fptype couplings[],
@@ -320,12 +320,12 @@ namespace mg5amcCpu
using namespace Parameters_sm_dependentCouplings;
const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
- fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
- fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
- cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
- cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
- cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s );
+ fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+ fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+ fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
+ cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
+ cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+ cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s );
GC_10s_sv = couplings_sv.GC_10;
GC_11s_sv = couplings_sv.GC_11;
GC_12s_sv = couplings_sv.GC_12;
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
#define MGONGPU_FPTYPE2_DOUBLE 1 // default
//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
#endif
+
// Choose whether to inline all HelAmps functions
// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
#endif
+// Choose if cuBLAS and hipBLAS are supported for generating random numbers
+// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
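The new MGONGPU_HAS_NO_BLAS block just above defaults to "BLAS available" under nvcc and hipcc and forces the noBLAS path in plain C++ builds, while still allowing an external -DMGONGPU_HAS_NO_BLAS override. A minimal stand-alone sketch of the same gating logic (illustrative only, compiled outside the plugin):

```cpp
// Default to "BLAS available" under GPU compilers, force noBLAS in plain C++,
// mirroring the mgOnGpuConfig.h hunk above (which also allows -D overrides).
#include <cstdio>

#if defined __CUDACC__ || defined __HIPCC__
// GPU compilers: cuBLAS/hipBLAS assumed available unless -DMGONGPU_HAS_NO_BLAS is passed
#else
#define MGONGPU_HAS_NO_BLAS 1
#endif

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  std::printf( "BLAS color sums can be enabled at runtime\n" );
#else
  std::printf( "noBLAS build: color sums always use the kernel implementation\n" );
#endif
  return 0;
}
```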
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 882c93c3a5..cd5159694a 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006311178207397461  +DEBUG: model prefixing takes 0.00543212890625  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
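Stepping back to the mgOnGpuCxtypes.h hunk above: it adds operator+= and operator-= to the cxtype_ref proxy, so complex accumulation can write through split real/imaginary storage without first materialising an lvalue cxtype. A self-contained sketch of the idea (std::complex stands in here for the plugin's cxtype; the member layout mirrors the diff):

```cpp
#include <complex>
#include <cstdio>

using fptype = double;
using cxtype = std::complex<double>;

// Proxy holding pointers to separately stored real and imaginary parts,
// with the compound assignments added in this patch.
class cxtype_ref
{
public:
  cxtype_ref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxtype_ref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxtype_ref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re = 1., im = 2.;              // split storage, e.g. SOA real/imag planes
  cxtype_ref ref( re, im );
  ref += cxtype( 0.5, -1. );            // accumulate in place through the proxy
  std::printf( "( %f, %f )\n", re, im ); // ( 1.5, 1.0 )
  return 0;
}
```

This is what lets the new color-sum code accumulate jamps with "+=" directly into split buffers.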
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,21 +155,21 @@ INFO: Process has 16 diagrams Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -177,7 +177,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -187,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.323 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.791s -user 0m0.727s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.792s +user 0m0.730s +sys 0m0.045s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads
<< ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
m_pmek->setGrid( m_gpublocks, m_gputhreads );
}
#endif
@@ -347,7 +353,9 @@ namespace mg5amcCpu
if( goodHelOnly ) return;
m_pmek->computeMatrixElements( useChannelIds );
copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
copyHostFromDevice( m_hstSelHel, m_devSelHel );
copyHostFromDevice( m_hstSelCol, m_devSelCol );
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@ namespace mg5amcCpu
}
if( goodHelOnly ) return;
m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MG5AMC_GPUABSTRACTION_H
#define MG5AMC_GPUABSTRACTION_H 1
+#include "mgOnGpuConfig.h"
+
#include
//--------------------------------------------------------------------------
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
#define gpuError_t cudaError_t
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice cudaSetDevice
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuDeviceReset cudaDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
//--------------------------------------------------------------------------
#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice hipSetDevice
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuDeviceReset hipDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
//--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
#endif
#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MG5AMC_GPURUNTIME_H
#define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+ if ( code != GPUBLAS_STATUS_SUCCESS )
+ {
+ printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
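The GpuAbstraction.h hunks above introduce a generic gpuBlasT* name family that resolves to the S- or D-precision cuBLAS/hipBLAS entry points according to the fptype2 choice, so BLAS call sites are written once. A toy sketch of this preprocessing-time dispatch (fakeSgemm/fakeDgemm are stand-ins of my own, not real BLAS symbols):

```cpp
#include <cstdio>

//#define MGONGPU_FPTYPE2_FLOAT 1 // toggle to dispatch to the S-precision stand-in

void fakeSgemm( const float* ) { std::printf( "fakeSgemm (fptype2=float)\n" ); }
void fakeDgemm( const double* ) { std::printf( "fakeDgemm (fptype2=double)\n" ); }

#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2;
#define gpuBlasTgemm fakeSgemm // the real code maps this to cublasSgemm/hipblasSgemm
#else
typedef double fptype2;
#define gpuBlasTgemm fakeDgemm // the real code maps this to cublasDgemm/hipblasDgemm
#endif

int main()
{
  fptype2 dummy = 0;
  gpuBlasTgemm( &dummy ); // one call site, precision resolved by the preprocessor
  return 0;
}
```

The same single-name trick covers axpy, dot, gemv and the strided-batched gemm used by the color sums.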
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
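The MatrixElementKernels.h members above reserve CPPProcess::ncomb slots for per-helicity GPU streams and BLAS handles, of which only nGoodHel are actually created (see the gpuStreamCreate loop in the MatrixElementKernels.cc hunk earlier). A CUDA sketch of that lifecycle, with an illustrative ncomb value in place of the real CPPProcess constant:

```cpp
#include <cuda_runtime.h>
#include <cassert>

constexpr int ncomb = 16;                // illustrative; the real value is CPPProcess::ncomb
cudaStream_t helStreams[ncomb] = {};     // zero-initialised: unused slots stay nullptr

void createGoodHelStreams( int nGoodHel )
{
  assert( nGoodHel <= ncomb );           // only the good helicities get a stream
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &helStreams[ighel] );
}

void destroyHelStreams()
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( helStreams[ihel] ) cudaStreamDestroy( helStreams[ihel] ); // skip never-created slots
}

int main()
{
  createGoodHelStreams( 4 ); // e.g. 4 good helicities found by the filtering pass
  destroyHelStreams();
  return 0;
}
```

Reserving ncomb fixed slots avoids a dynamic allocation whose size (nGoodHel) is only known after helicity filtering.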
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
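The MemoryAccessAmplitudes.h hunk above deletes the AOSOA[npagA][nx2][neppA] indexing machinery, which was dead code behind MGONGPU_TRIVIAL_AMPLITUDES, and keeps only the trivial reinterpret_cast accessor. For reference, a compact sketch of the removed AOSOA index arithmetic (neppA=2 here for illustration; the deleted class used neppA=1, i.e. plain AOS):

```cpp
#include <cstdio>

using fptype = double;
constexpr int nx2 = 2;   // real and imaginary components
constexpr int neppA = 2; // events per "page" (the removed code used 1)

// AOSOA access: buffer[ipagA][ix2][ieppA], flattened
fptype& aosoaAccess( fptype* buffer, int ievt, int ix2 )
{
  const int ipagA = ievt / neppA; // which event page
  const int ieppA = ievt % neppA; // event within the page
  return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA];
}

int main()
{
  fptype buf[8] = { 0 };
  aosoaAccess( buf, 3, 1 ) = 7.;           // event 3, imaginary part
  std::printf( "buf[7] = %f\n", buf[7] );  // lands in page 1, slot [1][1] -> index 7
  return 0;
}
```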
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
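
In the MemoryBuffers.h hunk below, the virtual ~BufferBase destructor moves from protected to public. A simplified sketch (stand-in classes, not the plugin's) of why that matters once buffers are owned and deleted through base-class pointers:

  // buffer_sketch.cc: public virtual destructor for polymorphic buffer ownership
  #include <cstddef>
  template<typename T>
  class BufferBaseSketch // stand-in for BufferBase<T>
  {
  public:
    virtual ~BufferBaseSketch() {} // public virtual: "delete base*" is well defined
    T* data() { return m_data; }
  protected:
    BufferBaseSketch( const std::size_t size ) : m_size( size ), m_data( nullptr ) {}
    const std::size_t m_size;
    T* m_data;
  };
  class HostBufferSketch final : public BufferBaseSketch<double>
  {
  public:
    HostBufferSketch( const std::size_t size ) : BufferBaseSketch<double>( size ) { m_data = new double[size]; }
    ~HostBufferSketch() override { delete[] m_data; }
  };
  int main()
  {
    BufferBaseSketch<double>* b = new HostBufferSketch( 16 );
    delete b; // dispatches to ~HostBufferSketch; with a protected base dtor this would not compile
    return 0;
  }

With the destructor protected, 'delete b' through the base pointer is ill-formed; making it public while keeping it virtual is the usual idiom when heterogeneous buffers are owned and released polymorphically.
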
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index bf77ac9970..1db10f1e09 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,494 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
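
The TRIVIAL vs non-trivial access distinction in the comments above reduces to where the event index comes from. A small CUDA sketch of the two patterns (hypothetical names; nw6/nx2 values illustrative, not the plugin's): trivial access reinterprets a buffer that already belongs to one event, while device access locates the current thread's record inside a buffer holding all events:

  // access_sketch.cu: trivial vs thread-indexed buffer access
  #include <cuda_runtime.h>
  struct ComplexSketch { double r, i; }; // stand-in for a scalar complex type
  // Trivial access: the buffer passed in already belongs to a single event
  __host__ __device__ inline ComplexSketch* trivialAccess( double* buffer )
  {
    return reinterpret_cast<ComplexSketch*>( buffer );
  }
  // Thread-indexed access: locate this thread's event record inside a
  // global-memory buffer of nevt records of nw6*nx2 doubles each
  __device__ inline ComplexSketch* deviceAccess( double* buffer, const int nw6, const int nx2 )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    return reinterpret_cast<ComplexSketch*>( buffer + ievt * nw6 * nx2 );
  }
  __global__ void touchFirstComponent( double* buffer )
  {
    ComplexSketch* w = deviceAccess( buffer, 6, 2 ); // 6 components x (re,im)
    w[0].r = 1.;
  }
  int main()
  {
    double* d = nullptr;
    cudaMalloc( &d, 256 * 6 * 2 * sizeof( double ) );
    touchFirstComponent<<<2, 128>>>( d ); // 256 threads = 256 events
    cudaDeviceSynchronize();
    cudaFree( d );
    return 0;
  }

Kernel splitting is what forces the switch in the hunks that follow: once each diagram runs in its own kernel, wavefunctions must persist in global memory between kernels, so they need the thread-indexed pattern instead of the trivial one.
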
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - // Amplitude(s) for diagram number 14 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - // Amplitude(s) for diagram number 16 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?)
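
The color algebra in the removed lines that follow computes |M|^2 as the quadratic form sum over icol,jcol of jamp[icol]* cf[icol][jcol] jamp[jcol] / denom[icol]. Because cf is real and symmetric, the complex quadratic form splits into two real ones (AMA + BMB, see #475), and the triangular variant with pre-divided denominators gives the same result while touching each off-diagonal element once. A standalone numerical check with an illustrative 2-color matrix (not the gg_ttxg one):

  // colorsum_sketch.cc: full vs triangular color quadratic form
  #include <cassert>
  #include <cmath>
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncol = 2;
    const double denom[ncol] = { 9, 9 };
    const double cf[ncol][ncol] = { { 64, -8 }, { -8, 64 } };
    const std::complex<double> jamp[ncol] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
    // Full double loop (the CUDA-style implementation below)
    double me2full = 0;
    for( int i = 0; i < ncol; i++ )
    {
      double ztR = 0, ztI = 0;
      for( int j = 0; j < ncol; j++ )
      {
        ztR += cf[i][j] * jamp[j].real();
        ztI += cf[i][j] * jamp[j].imag();
      }
      me2full += ( ztR * jamp[i].real() + ztI * jamp[i].imag() ) / denom[i];
    }
    // Triangular form (the C++-style implementation below): diagonal once, off-diagonal doubled
    double me2tri = 0;
    for( int i = 0; i < ncol; i++ )
    {
      double ztR = ( cf[i][i] / denom[i] ) * jamp[i].real();
      double ztI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
      for( int j = i + 1; j < ncol; j++ )
      {
        ztR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
        ztI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
      }
      me2tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
    }
    assert( std::abs( me2full - me2tri ) < 1e-9 );
    printf( "|M|^2 = %f from both forms\n", me2full );
    return 0;
  }

Note that folding the two off-diagonal terms into one doubled term relies on equal denominators within each symmetric pair (true here and for the gg_ttxg matrix below, where every denom entry is 9), and the imaginary cross terms cancel exactly because cf is real and symmetric.
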
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -768,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -802,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -843,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -945,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over all good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // ME sum over all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- +
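For context: update_jamp2s above is launched once per good helicity, and each helicity runs on its own CUDA/HIP stream, so several instances may be in flight concurrently against the same colAllJamp2s running sums; that is why the accumulation must use atomicAdd. A minimal self-contained CUDA sketch of the same pattern (hypothetical names and sizes, not the plugin's API; double-precision atomicAdd requires compute capability 6.0 or later):

    #include <cuda_runtime.h>

    // One kernel instance per "helicity", all launched on different streams and
    // all accumulating into the same per-color running sums: the read-modify-write
    // must be atomic, or concurrent streams would lose updates.
    __global__ void accumulateJamp2( double* jamp2Sum, const double* jamp2OneHel, int ncolor )
    {
      const int nevt = gridDim.x * blockDim.x;
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      for( int icol = 0; icol < ncolor; icol++ )
        atomicAdd( &jamp2Sum[icol * nevt + ievt], jamp2OneHel[icol * nevt + ievt] );
    }

    int main()
    {
      const int ncolor = 6, nevt = 256, ngoodhel = 4;
      double *sum, *part;
      cudaMalloc( &sum, ncolor * nevt * sizeof( double ) );
      cudaMalloc( &part, ncolor * nevt * sizeof( double ) );
      cudaMemset( sum, 0, ncolor * nevt * sizeof( double ) );
      cudaMemset( part, 0, ncolor * nevt * sizeof( double ) );
      cudaStream_t streams[ngoodhel];
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) cudaStreamCreate( &streams[ighel] );
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) // one helicity per stream
        accumulateJamp2<<<nevt / 32, 32, 0, streams[ighel]>>>( sum, part, ncolor );
      cudaDeviceSynchronize(); // as in sigmaKin: wait for all streams before consuming the sums
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) cudaStreamDestroy( streams[ighel] );
      cudaFree( sum );
      cudaFree( part );
      return 0;
    }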
#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
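The selection rule used by select_col (and by the helicity selection in add_and_select_hel) is standard inverse-CDF sampling over a running sum of weights; icolamp simply masks out color flows that do not contribute to the chosen SDE configuration. A host-side C++ sketch of the rule, with hypothetical plain arrays rather than the plugin's accessors:

    #include <cassert>

    // Pick the first index whose cumulative weight fraction exceeds a random
    // number in [0,1); disabled entries contribute zero weight. Returns a
    // Fortran-style index in [1,n], mirroring the allselcol/allselhel conventions.
    int selectIndex( const double* weights, const bool* enabled, int n, double rnd )
    {
      const int nmax = 64;
      assert( n <= nmax ); // fixed-size scratch for this sketch
      double cumul[nmax];
      double running = 0;
      for( int i = 0; i < n; i++ )
      {
        if( enabled[i] ) running += weights[i]; // running sum, as in targetamp[icolC]
        cumul[i] = running;
      }
      for( int i = 0; i < n; i++ )
        if( rnd < cumul[i] / cumul[n - 1] ) return i + 1;
      return n; // numerical safety net (rnd close to 1 plus rounding)
    }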
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1135,13 +1075,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // 
non-trivial access: buffer includes all events @@ -1153,17 +1087,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1189,93 +1126,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], 
gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1317,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1340,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1349,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1377,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1394,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1500,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */
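For orientation before the new color_sum.cc below: for each event and helicity the color sum evaluates deltaME = J^dagger (C/d) J, where J is the vector of ncolor partial amplitudes (jamps), C the real symmetric color matrix and d the color denominators; because C is real, the complex quadratic form reduces to Re(J)^T M Re(J) + Im(J)^T M Im(J) (the "AMA + BMB" rewrite cited in the comments, #475). A naive scalar reference of that formula, using plain std::complex arrays rather than the plugin's vectorized types:

    #include <complex>

    // Naive reference for the color sum: deltaME = J^dagger (C/d) J with a real
    // symmetric color matrix C. Because C is real, the imaginary cross terms
    // cancel and deltaME = Re(J)^T M Re(J) + Im(J)^T M Im(J), which is what the
    // optimized CPU, GPU kernel and BLAS paths below compute in different layouts.
    template<int NCOLOR>
    double colorSumReference( const std::complex<double> jamp[NCOLOR],
                              const double colorMatrix[NCOLOR][NCOLOR],
                              const double colorDenom[NCOLOR] )
    {
      double deltaME = 0;
      for( int i = 0; i < NCOLOR; i++ )
      {
        double ztempR = 0, ztempI = 0;
        for( int j = 0; j < NCOLOR; j++ )
        {
          ztempR += colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
          ztempI += colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
        }
        deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
      }
      return deltaME;
    }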
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + 
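+ // In matrix form, steps 1 and 2 compute, for each event column J = jamp(:,ievt): + // ztemp(:,ievt) = M * J (step 1: one GEMM over all nevt events, M = normalized color matrix) + // ME(ievt) += J^T * ztemp(:,ievt) (step 2: nevt 1x1 dot products via one strided-batched GEMM; beta=1 accumulates) + // done separately for the real and imaginary parts of the jamps, which suffices because M is real. + 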
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..91d334bc4e --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h @@ -0,0 +1,509 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
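The per-diagram kernels in diagrams.h below all share one calling convention, with the common setup factored into diagram_boilerplate.h (which this diff does not show: it is added as a symlink above). A hypothetical sketch of the kind of glue that header must provide, inferred only from the names used inside diagramN (w_fp, amp_sv, amp_fp, J_ACCESS and the nullptr sanity asserts); the real header and its buffer striding may differ:

    // HYPOTHETICAL reconstruction for orientation only - not the actual header.
    #ifdef MGONGPUCPP_GPUIMPL
    using J_ACCESS = DeviceAccessJamp; // jamps buffer spans all events
    #else
    using J_ACCESS = HostAccessJamp; // jamps buffer spans one SIMD event page
    #endif
    fptype* w_fp[nwf]; // per-wavefunction views into the wfs super-buffer
    for( int iwf = 0; iwf < nwf; iwf++ )
      w_fp[iwf] = wfs + iwf * nw6 * mgOnGpu::nx2; // illustrative striding only
    cxtype_sv amp_sv[1]; // the amplitude of the current diagram
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to HELAS calls as &amp_fp[0]
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // uniform interface: the three multichannel pointers must then be nullptr
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
    #endif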
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU 
or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for 
diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
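For orientation, the split kernels above are designed to be invoked once per helicity, in diagram order, on the same wavefunction and jamp buffers; diagram1 additionally computes the external wavefunctions. A minimal host-side sketch of such a chain in the GPU build follows; the wrapper name and launch parameters are hypothetical and this is not the plugin's actual scheduling code:

  // Sketch only (assumed names): chain the generated diagram kernels for one helicity
  void launchDiagramsOneHelicity_sketch( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                         const fptype* couplings, fptype* numerators, fptype* denominators,
                                         const fptype* momenta, int ihel, int gpublocks, int gputhreads )
  {
    // diagram1 also fills the external wavefunctions for this helicity
    diagram1<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
    // diagrams 2..16 reuse wavefunctions computed by earlier kernels and accumulate into jamps
    diagram2<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... diagram3 up to diagram15 launched in the same way ...
    diagram16<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // the accumulated jamps are then reduced to |M|^2 by the colour sum (see color_sum.h below)
  }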
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
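To make the "new1" striding chosen above concrete: all real parts are stored first (one length-nevt row per colour), followed by all imaginary parts in the same layout, so that each (real/imag, icol) row is contiguous across events. A small self-contained sketch of the index arithmetic, with a hypothetical helper name for illustration:

  // Sketch of the "new1" jamp striding: part is 0 for real, 1 for imag
  inline int jampIndexNew1( int part, int icol, int ievt, int ncolor, int nevt )
  {
    return part * ncolor * nevt + icol * nevt + ievt;
  }
  // Example: with nevt=4 and ncolor=6, the imaginary part of colour 2 of event 3
  // sits at index 1*6*4 + 2*4 + 3 = 35; the commented-out "old" striding would
  // instead give icol*2*nevt + nevt + ievt = 2*8 + 4 + 3 = 23.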
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 
 else ifeq ($(BACKEND),hip)
 
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 
   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)
 
   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 
 #-------------------------------------------------------------------------------
 
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o
 	$(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS)
 endif
@@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin)
 $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe)
 endif
@@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob
 else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
 ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
 $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h
new file mode 100644
index 0000000000..96a34fb1bf
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h
@@ -0,0 +1,103 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin.
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+  //-------------
+  // GPU only
+  //-------------
+
+  //using namespace mg5amcGpu;
+  using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current event (CUDA)
+  unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+  // Wavefunctions
+  // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+  // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+  // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+  // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+  // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+  // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+  // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+  const int nevt = gridDim.x * blockDim.x;
+  fptype* w_fp[nwf];
+  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+  // Couplings
+  constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+  const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
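One detail worth spelling out: on GPU the channelIds buffer holds one entry per event, while in C++ the whole SIMD event page shares a single scalar (hence the plain *channelIds dereference above). The following is a sketch of the per-thread lookup that gpu_channelId is assumed to perform; the function body is an illustration of the assumed behaviour, not the plugin's actual implementation:

  // Sketch (assumed behaviour) of the per-event channelId lookup on GPU
  __device__ inline unsigned int
  gpu_channelId_sketch( const unsigned int* channelIds )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // same indexing as DeviceAccessJamp
    return channelIds[ievt]; // 1 to #diagrams, 0 to disable SDE
  }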
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include 
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
index ff9f0d7f00..cb66251689 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 
   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVV1_0( const fptype allV1[],
           const fptype allV2[],
@@ -872,7 +872,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVV1P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -885,7 +885,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_0( const fptype allF1[],
           const fptype allF2[],
@@ -897,7 +897,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_1( const fptype allF2[],
           const fptype allV3[],
@@ -910,7 +910,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_2( const fptype allF1[],
           const fptype allV3[],
@@ -923,7 +923,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1P0_3( const fptype allF1[],
             const fptype allF2[],
@@ -936,7 +936,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV1P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -950,7 +950,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV3P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -964,7 +964,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV4P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -978,7 +978,7 @@
 //==========================================================================
 
   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   VVV1_0( const fptype allV1[],
           const fptype allV2[],
@@ -991,7 +991,7 @@ namespace mg5amcCpu
     const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 );
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) };
@@ -1014,7 +1014,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVV1P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1027,7 +1027,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) };
@@ -1052,7 +1052,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_0( const fptype allF1[],
           const fptype allF2[],
@@ -1065,7 +1065,7 @@ namespace mg5amcCpu
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -1077,7 +1077,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_1( const fptype allF2[],
           const fptype allV3[],
@@ -1090,7 +1090,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
     const cxtype cI = cxmake( 0., 1. );
     F1[0] = +F2[0] + V3[0];
@@ -1109,7 +1109,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_2( const fptype allF1[],
           const fptype allV3[],
@@ -1122,7 +1122,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
     const cxtype cI = cxmake( 0., 1. );
     F2[0] = +F1[0] + V3[0];
@@ -1141,7 +1141,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1P0_3( const fptype allF1[],
             const fptype allF2[],
@@ -1154,7 +1154,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 );
     const cxtype cI = cxmake( 0., 1. );
     V3[0] = +F1[0] + F2[0];
@@ -1172,7 +1172,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV1P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -1187,7 +1187,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1207,7 +1207,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV3P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1222,7 +1222,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1242,7 +1242,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1257,7 +1257,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
index 47a3a011b8..fd5642f3e3 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
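The rename from C_ACCESS to CD_ACCESS in the HelAmps templates above is cosmetic but deliberate: the declaration now uses the same spelling as the aliases that the diagram boilerplate defines at the call sites, which distinguish event-dependent couplings from fixed ones. For illustration, with the aliases from diagram_boilerplate.h, a call in a diagram kernel reads:

  // Aliases from the boilerplate (GPU branch shown):
  using CD_ACCESS = DeviceAccessCouplings;      // dependent couplings, vary per event
  using CI_ACCESS = DeviceAccessCouplingsFixed; // independent couplings, fixed for all events
  // Instantiation with a gs-dependent coupling, as in the diagram kernels above:
  VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );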
// Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g.
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 78cdfd68b2..4b41dc7b62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
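The two new compound-assignment operators on cxtype_ref above let callers accumulate complex values directly through the reference proxy, instead of a read-modify-write round trip through a cxtype temporary. A self-contained sketch of the same proxy idea (ToyCxRef is hypothetical; std::complex stands in for cxtype, and the split real/imag slots mimic the strided storage the real class points into):

```cpp
#include <cassert>
#include <complex>

using fptype = double;
using cxtype = std::complex<fptype>;

// Minimal stand-in for cxtype_ref: a write-through proxy over real and
// imaginary parts that live in two separate fptype slots.
class ToyCxRef
{
public:
  ToyCxRef( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  ToyCxRef& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  // The two operators added in the diff: in-place accumulation on the proxy
  ToyCxRef& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  ToyCxRef& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re[1] = { 1. }, im[1] = { 2. }; // split real/imag storage
  ToyCxRef ref( re[0], im[0] );
  ref += cxtype( 1., 3. ); // accumulate straight into the split storage
  ref -= cxtype( 0., 1. );
  assert( cxtype( ref ) == cxtype( 2., 4. ) );
  return 0;
}
```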
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006547212600708008  +DEBUG: model prefixing takes 0.005442380905151367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.158 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 
34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -Wrote files for 222 helas calls in 0.660 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1665]  +Generated helas calls 
for 1 subprocesses (123 diagrams) in 0.424 s +Wrote files for 222 helas calls in 0.655 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.327 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.306 s VVV1 VVV1 FFV1 @@ -208,38 +208,38 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m4.934s -user 0m3.516s -sys 0m0.277s -Code generation completed in 5 seconds +real 0m3.826s +user 0m3.524s +sys 0m0.296s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -252,7 +252,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -260,9 +260,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -282,7 +282,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -290,9 +290,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 1fa5e235b3..dcbb38ba34 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index ecdc7fd25c..964b954d74 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 7ec841d6c2..308f5bed4f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! 
minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/makefile b/epochX/cudacpp/gg_ttgg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
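The Bridge.h hunks above wrap the instantiation and abnormal-ME warnings in a new MGONGPUCPP_VERBOSE compile-time guard, so the default build is silent. A minimal sketch of that gating pattern (only the macro name comes from the diff; the function is illustrative):

```cpp
#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to restore the banners; silent otherwise.
void instantiateToyBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate toy Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... the actual construction work stays unconditional ...
}

int main()
{
  instantiateToyBridge( 16 );
  return 0;
}
```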
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
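The gpuBlasT* aliases closing GpuAbstraction.h pick the single-precision (S) or double-precision (D) BLAS entry points at preprocessing time, keyed on MGONGPU_FPTYPE2_FLOAT, so a call site such as color_sum_blas can use one precision-neutral name. A host-only sketch of the same dispatch, with toy functions in place of cuBLAS/hipBLAS calls (only the macro names come from the diff):

```cpp
#include <cstdio>

// Toy stand-ins for the S/D BLAS entry points
static void toySgemm() { std::printf( "sgemm (FP32)\n" ); }
static void toyDgemm() { std::printf( "dgemm (FP64)\n" ); }

// Mirror of the MGONGPU_FPTYPE2_FLOAT dispatch in GpuAbstraction.h:
// one generic name, resolved at preprocessing time to the right precision.
#ifdef MGONGPU_FPTYPE2_FLOAT
#define toyTgemm toySgemm
#else
#define toyTgemm toyDgemm
#endif

int main()
{
  toyTgemm(); // prints "dgemm (FP64)" unless built with -DMGONGPU_FPTYPE2_FLOAT
  return 0;
}
```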
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
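The new checkGpuBlas/assertGpuBlas pair in GpuRuntime.h follows the existing checkGpu idiom: a macro captures __FILE__ and __LINE__ at the call site, and a helper prints the failing status and asserts. A standalone sketch of that idiom (the Toy status enum replaces gpuBlasStatus_t, which would require a CUDA/HIP toolkit):

```cpp
#include <cassert>
#include <cstdio>

// Toy status type replacing gpuBlasStatus_t / GPUBLAS_STATUS_SUCCESS
enum ToyBlasStatus { TOYBLAS_STATUS_SUCCESS = 0 };

// Same shape as checkGpuBlas/assertGpuBlas: the macro records the call site,
// the helper reports a non-success status and aborts via assert if requested.
#define checkToyBlas( code ) { assertToyBlas( code, __FILE__, __LINE__ ); }
inline void assertToyBlas( ToyBlasStatus code, const char* file, int line, bool abort = true )
{
  if( code != TOYBLAS_STATUS_SUCCESS )
  {
    std::printf( "ERROR! assertToyBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == TOYBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  checkToyBlas( TOYBLAS_STATUS_SUCCESS ); // success: no output, execution continues
  return 0;
}
```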
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
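Stepping back to the MatrixElementKernels.h hunk just above before continuing with the memory-access changes: the header reserves CPPProcess::ncomb slots for per-helicity GPU streams and cuBLAS/hipBLAS handles, and computeGoodHelicities() only creates the first nGoodHel of them, binding one handle to one stream and optionally enabling TF32 tensor-core math. A minimal standalone sketch of that pattern in plain CUDA/cuBLAS follows; the function name createPerHelicityStreamsAndHandles and the explicit error handling are illustrative only, since the plugin itself goes through its gpuXxx/checkGpuBlas abstraction layer.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Create one stream per good helicity and bind one cuBLAS handle to each stream,
// so that later per-helicity GEMM color sums can run concurrently (sketch only)
static void createPerHelicityStreamsAndHandles( int nGoodHel, cudaStream_t* streams, cublasHandle_t* handles, bool useTf32 )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    if( cudaStreamCreate( &streams[ighel] ) != cudaSuccess ) { fprintf( stderr, "cudaStreamCreate failed\n" ); exit( 1 ); }
    if( cublasCreate( &handles[ighel] ) != CUBLAS_STATUS_SUCCESS ) { fprintf( stderr, "cublasCreate failed\n" ); exit( 1 ); }
    cublasSetStream( handles[ighel], streams[ighel] ); // attach the per-helicity stream to the handle
    if( useTf32 ) cublasSetMathMode( handles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ); // optional TF32 tensor cores (CUDA only)
  }
}

Binding one handle to one stream is what later allows each helicity's color-sum GEMM to overlap with those of the other helicities.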
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
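With the MemoryAccessAmplitudes.h change above, only the "trivial" access path that the MGONGPU_TRIVIAL_AMPLITUDES flag used to select is kept: the buffer pointer already addresses one event's record, so reading the complex amplitude is a plain reinterpretation of the underlying fptype storage. A simplified, self-contained illustration of the idiom, with plain doubles and a stand-in cxtype struct instead of the plugin's fptype/cxtype_sv:

#include <cassert>

struct cxtype { double real, imag; }; // stand-in for the plugin's complex type

// "Trivial" access: no event indexing, just reinterpret the raw [re, im] storage
inline cxtype* kernelAccess( double* buffer )
{
  return reinterpret_cast<cxtype*>( buffer );
}

int main()
{
  double buffer[2] = { 1.5, -0.5 }; // one complex amplitude stored as two doubles
  cxtype* amp = kernelAccess( buffer );
  assert( amp->real == 1.5 && amp->imag == -0.5 );
  return 0;
}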
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
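For reference, the AOSOA[npagW][nw6][nx2][neppW] indexing implemented by the deleted MemoryAccessWavefunctionsBase code around this hunk can be sanity-checked in a few self-contained lines; the sizes and indices below are arbitrary examples, and neppW = 1 reduces the layout to a plain AOS:

#include <cassert>

int main()
{
  const int neppW = 4, nw6 = 6, nx2 = 2; // example AOSOA dimensions
  const int ievt = 9, iw6 = 3, ix2 = 1;  // example event and field indices
  const int ipagW = ievt / neppW;        // which "W-page" the event lives on
  const int ieppW = ievt % neppW;        // position of the event within its page
  const int index = ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW;
  assert( index == 125 ); // = 2*48 + 3*8 + 1*4 + 1
  return 0;
}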
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
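The new DeviceAccessWavefunctions introduced in MemoryAccessWavefunctions.h above replaces the AOSOA decode with a per-thread offset into a single global wavefunction buffer: CUDA thread ievt owns the contiguous slice of nw6*nx2 fptype values starting at ievt*nw6*nx2, which is what lets the split diagram kernels share wavefunctions through global memory. A standalone CUDA sketch of the same access pattern; the kernel name fillWavefunction and its zero-fill body are hypothetical:

#include <cuda_runtime.h>

constexpr int nw6 = 6, nx2 = 2; // one wavefunction: 6 complex components = 12 reals

__global__ void fillWavefunction( double* allWfs ) // layout: [nevt][nw6][nx2]
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread = one event
  double* wf = allWfs + ievt * nw6 * nx2;                 // this event's contiguous slice
  for( int i = 0; i < nw6 * nx2; i++ ) wf[i] = 0.;        // e.g. zero-initialise the slice
}

int main()
{
  const int gpublocks = 2, gputhreads = 128, nevt = gpublocks * gputhreads;
  double* d_wfs = nullptr;
  cudaMalloc( &d_wfs, nevt * nw6 * nx2 * sizeof( double ) );
  fillWavefunction<<<gpublocks, gputhreads>>>( d_wfs );
  cudaDeviceSynchronize();
  cudaFree( d_wfs );
  return 0;
}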
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index c508e73f26..5e1fba0c34 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2412 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
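As the comments above describe, calculate_jamps is now a CUDA kernel that is launched once per good helicity, on that helicity's own stream, with the color sum that turns the jamps into |M|^2 moved to a separate step. A schematic, standalone sketch of the host-side launch pattern follows; the kernel name, its toy body and the plain double type are hypothetical placeholders for the generated code and fptype:

#include <cuda_runtime.h>

__global__ void computeJampsForHelicity( int ihel, const double* momenta, double* jamps )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = momenta[ievt] + ihel; // stand-in for the real Feynman-diagram computation
}

int main()
{
  const int gpublocks = 2, gputhreads = 128, nevt = gpublocks * gputhreads;
  const int nGoodHel = 4; // pretend 4 of the ncomb helicity combinations survived the filter
  cudaStream_t streams[4];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  double *d_momenta = nullptr, *d_helJamps = nullptr;
  cudaMalloc( &d_momenta, nevt * sizeof( double ) );
  cudaMemset( d_momenta, 0, nevt * sizeof( double ) );
  cudaMalloc( &d_helJamps, nGoodHel * nevt * sizeof( double ) ); // one jamp slice per good helicity
  // One launch per good helicity, each on its own stream, so independent helicities
  // may overlap on the GPU; a later color-sum step would combine the per-helicity slices
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    computeJampsForHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( ighel, d_momenta, d_helJamps + ighel * nevt );
  cudaDeviceSynchronize(); // wait for all helicity streams before any color sum
  cudaFree( d_momenta );
  cudaFree( d_helJamps );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  return 0;
}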
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 123 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 1 - VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 123 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 123 *** - - // Wavefunction(s) for diagram number 3 - VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 123 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 123 *** - - 
// Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 123 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 7 OF 123 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 123 *** - - // Wavefunction(s) for diagram number 8 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 123 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 10 OF 123 *** - - // Wavefunction(s) for diagram number 10 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 123 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) 
-
- // *** DIAGRAM 11 OF 123 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 12 OF 123 ***
-
- // Wavefunction(s) for diagram number 12
- // (none)
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 13 OF 123 ***
-
- // Wavefunction(s) for diagram number 13
- // (none)
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 14 OF 123 ***
-
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 15 OF 123 ***
-
- // Wavefunction(s) for diagram number 15
- // (none)
-
- // Amplitude(s) for diagram number 15
- FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 16 OF 123 ***
-
- // Wavefunction(s) for diagram number 16
- // (none)
-
- // Amplitude(s) for diagram number 16
- FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 17 OF 123 ***
-
- // Wavefunction(s) for diagram number 17
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 17
- FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 18 OF 123 ***
-
- // Wavefunction(s) for diagram number 18
- FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
- // Amplitude(s) for diagram number 18
- FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 19 OF 123 ***
-
- // Wavefunction(s) for diagram number 19
- // (none)
-
- // Amplitude(s) for diagram number 19
- FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 20 OF 123 ***
-
- // Wavefunction(s) for diagram number 20
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
-
- // Amplitude(s) for diagram number 20
- VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 21 OF 123 ***
-
- // Wavefunction(s) for diagram number 21
- // (none)
-
- // Amplitude(s) for diagram number 21
- FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 22 OF 123 ***
-
- // Wavefunction(s) for diagram number 22
- // (none)
-
- // Amplitude(s) for diagram number 22
- FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 23 OF 123 ***
-
- // Wavefunction(s) for diagram number 23
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
-
- // Amplitude(s) for diagram number 23
- VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 24 OF 123 ***
-
- // Wavefunction(s) for diagram number 24
- // (none)
-
- // Amplitude(s) for diagram number 24
- FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 25 OF 123 ***
-
- // Wavefunction(s) for diagram number 25
- // (none)
-
- // Amplitude(s) for diagram number 25
- FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
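The helper names above follow the usual ALOHA/HELAS convention (stated here as background, not taken from this diff): a _0 suffix computes the amplitude of a fully attached vertex, while suffixes like _1, _2, P0_1 or P0_3 return the off-shell wavefunction of the corresponding leg, propagator included, which is stored in a w_fp slot and reused by later diagrams. The slots themselves are scratch buffers that are overwritten as the diagram sequence proceeds; a minimal sketch of their layout, with sizes inferred from this file (the highest slot referenced below is w_fp[25]):

#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
constexpr int nwf = 26; // number of wavefunction slots (w_fp[0]..w_fp[25] appear in this file)
constexpr int nw6 = 6;  // six complex components per spin-1/2 or spin-1 wavefunction

int main()
{
  cxtype w_sv[nwf][nw6]{};  // wavefunction scratch buffers (one event, scalar sketch)
  fptype* w_fp[nwf];        // fptype* views of the same storage, as used by the helas calls
  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
  return 0;
}
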
-
- // *** DIAGRAM 26 OF 123 ***
-
- // Wavefunction(s) for diagram number 26
- FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
-
- // Amplitude(s) for diagram number 26
- FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 27 OF 123 ***
-
- // Wavefunction(s) for diagram number 27
- // (none)
-
- // Amplitude(s) for diagram number 27
- FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 28 OF 123 ***
-
- // Wavefunction(s) for diagram number 28
- // (none)
-
- // Amplitude(s) for diagram number 28
- FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 29 OF 123 ***
-
- // Wavefunction(s) for diagram number 29
- // (none)
-
- // Amplitude(s) for diagram number 29
- FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 30 OF 123 ***
-
- // Wavefunction(s) for diagram number 30
- // (none)
-
- // Amplitude(s) for diagram number 30
- FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 31 OF 123 ***
-
- // Wavefunction(s) for diagram number 31
- // (none)
-
- // Amplitude(s) for diagram number 31
- VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 32 OF 123 ***
-
- // Wavefunction(s) for diagram number 32
- VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
- VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
- VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 32
- FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 33 OF 123 ***
-
- // Wavefunction(s) for diagram number 33
- FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 33
- FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 34 OF 123 ***
-
- // Wavefunction(s) for diagram number 34
- FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 34
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 35 OF 123 ***
-
- // Wavefunction(s) for diagram number 35
- // (none)
-
- // Amplitude(s) for diagram number 35
- FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 36 OF 123 ***
-
- // Wavefunction(s) for diagram number 36
- FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 36
- VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 37 OF 123 ***
-
- // Wavefunction(s) for diagram number 37
- // (none)
-
- // Amplitude(s) for diagram number 37
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 38 OF 123 ***
-
- // Wavefunction(s) for diagram number 38
- // (none)
-
- // Amplitude(s) for diagram number 38
- FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 39 OF 123 ***
-
- // Wavefunction(s) for diagram number 39
- // (none)
-
- // Amplitude(s) for diagram number 39
- VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
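The added lines in the hunk above are the visible part of FIX #823: the loops over independent couplings must run up to nIPC rather than nicoup. The resulting COUPs table is a two-tier array of pointers, whose first ndcoup entries point into the per-event record of dependent couplings (those that vary with the running alphas) while the last nIPC entries alias the fixed independent couplings in cIPC. A hedged sketch of that layout (buffer shapes and sizes here are invented for illustration; only the loop structure mirrors the hunk):

#include <complex>
#include <cstddef>
using fptype = double;
using cxtype = std::complex<fptype>;

int main()
{
  constexpr std::size_t ndcoup = 2;    // dependent couplings, vary event-by-event
  constexpr std::size_t nIPC = 1;      // independent couplings, fixed for all events (FIX #823 bound)
  constexpr std::size_t nevt = 16;     // invented event-record size, illustration only
  cxtype dependentBuf[ndcoup][nevt]{}; // assumed per-coupling event records
  cxtype cIPC[nIPC]{};                 // fixed independent couplings
  const std::size_t ievt0 = 0;

  const cxtype* COUPs[ndcoup + nIPC];
  for( std::size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = &dependentBuf[idcoup][ievt0]; // dependent: point into this event's record
  for( std::size_t iicoup = 0; iicoup < nIPC; iicoup++ )
    COUPs[ndcoup + iicoup] = &cIPC[iicoup];       // independent: shared across all events
  return 0;
}
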
-
- // *** DIAGRAM 40 OF 123 ***
-
- // Wavefunction(s) for diagram number 40
- // (none)
-
- // Amplitude(s) for diagram number 40
- FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 41 OF 123 ***
-
- // Wavefunction(s) for diagram number 41
- // (none)
-
- // Amplitude(s) for diagram number 41
- FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 42 OF 123 ***
-
- // Wavefunction(s) for diagram number 42
- FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 42
- FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 43 OF 123 ***
-
- // Wavefunction(s) for diagram number 43
- // (none)
-
- // Amplitude(s) for diagram number 43
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 44 OF 123 ***
-
- // Wavefunction(s) for diagram number 44
- // (none)
-
- // Amplitude(s) for diagram number 44
- FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 45 OF 123 ***
-
- // Wavefunction(s) for diagram number 45
- // (none)
-
- // Amplitude(s) for diagram number 45
- FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 46 OF 123 ***
-
- // Wavefunction(s) for diagram number 46
- // (none)
-
- // Amplitude(s) for diagram number 46
- FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 47 OF 123 ***
-
- // Wavefunction(s) for diagram number 47
- // (none)
-
- // Amplitude(s) for diagram number 47
- VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 48 OF 123 ***
-
- // Wavefunction(s) for diagram number 48
- // (none)
-
- // Amplitude(s) for diagram number 48
- FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 49 OF 123 ***
-
- // Wavefunction(s) for diagram number 49
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
- FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 49
- FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 50 OF 123 ***
-
- // Wavefunction(s) for diagram number 50
- VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 50
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 51 OF 123 ***
-
- // Wavefunction(s) for diagram number 51
- // (none)
-
- // Amplitude(s) for diagram number 51
- FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 52 OF 123 ***
-
- // Wavefunction(s) for diagram number 52
- FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 52
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 53 OF 123 ***
-
- // Wavefunction(s) for diagram number 53
- // (none)
-
- // Amplitude(s) for diagram number 53
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 54 OF 123 ***
-
- // Wavefunction(s) for diagram number 54
- // (none)
-
- // Amplitude(s) for diagram number 54
- FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 55 OF 123 ***
-
- // Wavefunction(s) for diagram number 55
- // (none)
-
- // Amplitude(s) for diagram number 55
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 56 OF 123 ***
-
- // Wavefunction(s) for diagram number 56
- // (none)
-
- // Amplitude(s) for diagram number 56
- FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 57 OF 123 ***
-
- // Wavefunction(s) for diagram number 57
- // (none)
-
- // Amplitude(s) for diagram number 57
- VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 58 OF 123 ***
-
- // Wavefunction(s) for diagram number 58
- // (none)
-
- // Amplitude(s) for diagram number 58
- VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
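Diagrams like 58 above (and 74, 93, 100, 107 below) evaluate the same four-gluon vertex three times, via VVVV1_0, VVVV3_0 and VVVV4_0, because the QCD quartic coupling decomposes into three independent color/Lorentz structures; each call overwrites amp_sv[0] and scatters it into jamp_sv with its own index/sign pattern, and no multichannel counters are updated for these multi-amplitude diagrams. A schematic sketch of that scatter step (the Contribution helper is invented for this sketch; the index/sign table is copied from the VVVV1_0 piece of diagram 58):

#include <complex>
using cxtype = std::complex<double>;

struct Contribution { int iflow; cxtype factor; }; // invented helper, illustration only

inline void scatterAmp( const cxtype& amp, const Contribution* c, int n, cxtype jamp[24] )
{
  for( int i = 0; i < n; i++ ) jamp[c[i].iflow] += c[i].factor * amp; // accumulate into color flows
}

int main()
{
  cxtype jamp[24]{};
  const cxtype I( 0, 1 );
  // index/sign pattern of the VVVV1_0 piece of diagram 58 above
  const Contribution d58v1[] = { { 2, I }, { 6, -I }, { 8, I }, { 12, -I },
                                 { 19, -I }, { 20, I }, { 21, -I }, { 22, I } };
  scatterAmp( cxtype( 0.1, 0.2 ), d58v1, 8, jamp ); // placeholder amplitude value
  return 0;
}
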
-
- // *** DIAGRAM 59 OF 123 ***
-
- // Wavefunction(s) for diagram number 59
- VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 59
- VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 60 OF 123 ***
-
- // Wavefunction(s) for diagram number 60
- // (none)
-
- // Amplitude(s) for diagram number 60
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 61 OF 123 ***
-
- // Wavefunction(s) for diagram number 61
- // (none)
-
- // Amplitude(s) for diagram number 61
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 62 OF 123 ***
-
- // Wavefunction(s) for diagram number 62
- // (none)
-
- // Amplitude(s) for diagram number 62
- FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 63 OF 123 ***
-
- // Wavefunction(s) for diagram number 63
- // (none)
-
- // Amplitude(s) for diagram number 63
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 64 OF 123 ***
-
- // Wavefunction(s) for diagram number 64
- // (none)
-
- // Amplitude(s) for diagram number 64
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 65 OF 123 ***
-
- // Wavefunction(s) for diagram number 65
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 65
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 66 OF 123 ***
-
- // Wavefunction(s) for diagram number 66
- VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 66
- FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 67 OF 123 ***
-
- // Wavefunction(s) for diagram number 67
- // (none)
-
- // Amplitude(s) for diagram number 67
- FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 68 OF 123 ***
-
- // Wavefunction(s) for diagram number 68
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 68
- FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 69 OF 123 ***
-
- // Wavefunction(s) for diagram number 69
- // (none)
-
- // Amplitude(s) for diagram number 69
- FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 70 OF 123 ***
-
- // Wavefunction(s) for diagram number 70
- // (none)
-
- // Amplitude(s) for diagram number 70
- FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 71 OF 123 ***
-
- // Wavefunction(s) for diagram number 71
- // (none)
-
- // Amplitude(s) for diagram number 71
- FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
- // *** DIAGRAM 72 OF 123 ***
-
- // Wavefunction(s) for diagram number 72
- // (none)
-
- // Amplitude(s) for diagram number 72
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 73 OF 123 ***
-
- // Wavefunction(s) for diagram number 73
- // (none)
-
- // Amplitude(s) for diagram number 73
- VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 74 OF 123 ***
-
- // Wavefunction(s) for diagram number 74
- // (none)
-
- // Amplitude(s) for diagram number 74
- VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 75 OF 123 ***
-
- // Wavefunction(s) for diagram number 75
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
- // Amplitude(s) for diagram number 75
- VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 76 OF 123 ***
-
- // Wavefunction(s) for diagram number 76
- // (none)
-
- // Amplitude(s) for diagram number 76
- VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 77 OF 123 ***
-
- // Wavefunction(s) for diagram number 77
- // (none)
-
- // Amplitude(s) for diagram number 77
- FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 78 OF 123 ***
-
- // Wavefunction(s) for diagram number 78
- // (none)
-
- // Amplitude(s) for diagram number 78
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 79 OF 123 ***
-
- // Wavefunction(s) for diagram number 79
- // (none)
-
- // Amplitude(s) for diagram number 79
- FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 80 OF 123 ***
-
- // Wavefunction(s) for diagram number 80
- // (none)
-
- // Amplitude(s) for diagram number 80
- FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 81 OF 123 ***
-
- // Wavefunction(s) for diagram number 81
- FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 81
- FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 82 OF 123 ***
-
- // Wavefunction(s) for diagram number 82
- FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 82
- FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 83 OF 123 ***
-
- // Wavefunction(s) for diagram number 83
- // (none)
-
- // Amplitude(s) for diagram number 83
- FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
- // *** DIAGRAM 84 OF 123 ***
-
- // Wavefunction(s) for diagram number 84
- FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 84
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 85 OF 123 ***
-
- // Wavefunction(s) for diagram number 85
- // (none)
-
- // Amplitude(s) for diagram number 85
- FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 86 OF 123 ***
-
- // Wavefunction(s) for diagram number 86
- VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 86
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 87 OF 123 ***
-
- // Wavefunction(s) for diagram number 87
- FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 87
- FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 88 OF 123 ***
-
- // Wavefunction(s) for diagram number 88
- FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 88
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 89 OF 123 ***
-
- // Wavefunction(s) for diagram number 89
- // (none)
-
- // Amplitude(s) for diagram number 89
- FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 90 OF 123 ***
-
- // Wavefunction(s) for diagram number 90
- FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
- // Amplitude(s) for diagram number 90
- FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 91 OF 123 ***
-
- // Wavefunction(s) for diagram number 91
- // (none)
-
- // Amplitude(s) for diagram number 91
- FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 92 OF 123 ***
-
- // Wavefunction(s) for diagram number 92
- // (none)
-
- // Amplitude(s) for diagram number 92
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 93 OF 123 ***
-
- // Wavefunction(s) for diagram number 93
- // (none)
-
- // Amplitude(s) for diagram number 93
- VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 94 OF 123 ***
-
- // Wavefunction(s) for diagram number 94
- VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 94
- VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 95 OF 123 ***
-
- // Wavefunction(s) for diagram number 95
- VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
- // Amplitude(s) for diagram number 95
- VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 96 OF 123 ***
-
- // Wavefunction(s) for diagram number 96
- // (none)
-
- // Amplitude(s) for diagram number 96
- FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 97 OF 123 ***
-
- // Wavefunction(s) for diagram number 97
- // (none)
-
- // Amplitude(s) for diagram number 97
- FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 98 OF 123 ***
-
- // Wavefunction(s) for diagram number 98
- // (none)
-
- // Amplitude(s) for diagram number 98
- FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 99 OF 123 ***
-
- // Wavefunction(s) for diagram number 99
- // (none)
-
- // Amplitude(s) for diagram number 99
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 100 OF 123 ***
-
- // Wavefunction(s) for diagram number 100
- // (none)
-
- // Amplitude(s) for diagram number 100
- VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 101 OF 123 ***
-
- // Wavefunction(s) for diagram number 101
- VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 101
- VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 102 OF 123 ***
-
- // Wavefunction(s) for diagram number 102
- // (none)
-
- // Amplitude(s) for diagram number 102
- VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 103 OF 123 ***
-
- // Wavefunction(s) for diagram number 103
- // (none)
-
- // Amplitude(s) for diagram number 103
- FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 104 OF 123 ***
-
- // Wavefunction(s) for diagram number 104
- // (none)
-
- // Amplitude(s) for diagram number 104
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 105 OF 123 ***
-
- // Wavefunction(s) for diagram number 105
- // (none)
-
- // Amplitude(s) for diagram number 105
- FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 106 OF 123 ***
-
- // Wavefunction(s) for diagram number 106
- // (none)
-
- // Amplitude(s) for diagram number 106
- FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 107 OF 123 ***
-
- // Wavefunction(s) for diagram number 107
- // (none)
-
- // Amplitude(s) for diagram number 107
- VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 108 OF 123 ***
-
- // Wavefunction(s) for diagram number 108
- // (none)
-
- // Amplitude(s) for diagram number 108
- VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 109 OF 123 ***
-
- // Wavefunction(s) for diagram number 109
- // (none)
-
- // Amplitude(s) for diagram number 109
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 110 OF 123 ***
-
- // Wavefunction(s) for diagram number 110
- // (none)
-
- // Amplitude(s) for diagram number 110
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 111 OF 123 ***
-
- // Wavefunction(s) for diagram number 111
- // (none)
-
- // Amplitude(s) for diagram number 111
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 112 OF 123 ***
-
- // Wavefunction(s) for diagram number 112
- // (none)
-
- // Amplitude(s) for diagram number 112
- FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 113 OF 123 ***
-
- // Wavefunction(s) for diagram number 113
- // (none)
-
- // Amplitude(s) for diagram number 113
- FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 114 OF 123 ***
-
- // Wavefunction(s) for diagram number 114
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 114
- VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 115 OF 123 ***
-
- // Wavefunction(s) for diagram number 115
- // (none)
-
- // Amplitude(s) for diagram number 115
- FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 116 OF 123 ***
-
- // Wavefunction(s) for diagram number 116
- // (none)
-
- // Amplitude(s) for diagram number 116
- FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 117 OF 123 ***
-
- // Wavefunction(s) for diagram number 117
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 117
- VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 118 OF 123 ***
-
- // Wavefunction(s) for diagram number 118
- // (none)
-
- // Amplitude(s) for diagram number 118
- FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 119 OF 123 ***
-
- // Wavefunction(s) for diagram number 119
- // (none)
-
- // Amplitude(s) for diagram number 119
- FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
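After the last diagram, jamp_sv[0..23] holds the 24 leading-color flow amplitudes of this process. They are consumed twice: just below, for the leading-color choice, and in the color sum that contracts them with the static color matrix to produce the squared matrix element. A hedged sketch of that final contraction (cf and the common denominator are placeholders, not the real color data of this process):

#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
constexpr int ncolor = 24; // the jamp_sv[0..23] flows above

fptype colorSum( const cxtype jamp[ncolor] )
{
  static const fptype cf[ncolor][ncolor] = {}; // placeholder: the real color matrix is process-specific
  constexpr fptype denom = 1;                  // placeholder: the real code has one denominator per color
  fptype deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom; // |M|^2 contribution
  }
  return deltaME;
}
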
amp_sv[0]; - - // *** DIAGRAM 121 OF 123 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 122 OF 123 *** - - // Wavefunction(s) for diagram number 122 - // (none) - - // Amplitude(s) for diagram number 122 - VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 123 OF 123 *** - - // Wavefunction(s) for diagram number 123 - // (none) - - // Amplitude(s) for diagram number 123 - VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( 
jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) - - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - 
__host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is symmetric) - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
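The deleted comments above compress a few algebraic steps. Written out (this is only a restatement of the reasoning in those comments, with A and B the real and imaginary parts of the color flows jamp, and M the normalized color matrix, which is real and symmetric here since all denom entries are equal):

```latex
% Color sum for one event: jamp_i = A_i + i B_i, with M_{ij} = cf_{ij} / denom_i real and symmetric
\Delta\mathrm{ME} = \sum_{i,j} \mathrm{jamp}_i^{*}\, M_{ij}\, \mathrm{jamp}_j
                  = (A - iB)^{T} M (A + iB)
                  = A^{T} M A + i\, A^{T} M B - i\, B^{T} M A + B^{T} M B
                  = A^{T} M A + B^{T} M B \qquad ( M^{T}=M \Rightarrow A^{T} M B = B^{T} M A )
% Symmetry also gives the triangular form precomputed at compile time in cf2:
A^{T} M A = \sum_{i} M_{ii} A_i^{2} \; + \; 2 \sum_{i<j} M_{ij} A_i A_j
```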
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif
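The boilerplate above funnels momenta, jamps, channelIds, couplings, numerators and denominators into one uniform argument list for the generated per-diagram kernels called below. A self-contained toy (illustrative types and names only, not the plugin's real API) of why a uniform signature keeps the 123 call sites mechanical:

```cpp
// Toy illustration of the uniform diagram interface: every diagramN shares the
// same argument bundle, so the long call lists below stay codegen-friendly.
#include <cstdio>
using fptype = double;
struct Buffers { fptype* wfs; fptype* jamps; const unsigned int* channelIds; const fptype* couplings; fptype* numerators; fptype* denominators; };
using DiagramFn = void ( * )( Buffers& );
static void diagram2( Buffers& b ) { b.jamps[0] += 1.0; } // stand-in for FFV/VVV amplitude code
static void diagram3( Buffers& b ) { b.jamps[1] -= 1.0; }
int main()
{
  fptype jamps[2] = { 0, 0 };
  Buffers b{ nullptr, jamps, nullptr, nullptr, nullptr, nullptr };
  const DiagramFn diagrams[] = { diagram2, diagram3 };
  for( DiagramFn f : diagrams ) f( b ); // mechanical call list, as in the generated code
  std::printf( "jamps = %f %f\n", jamps[0], jamps[1] );
  return 0;
}
```

The same uniformity is what allows the nullptr/assert sanity checks above: unused pointers are simply passed as nullptr instead of changing the signature.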
+ + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -2718,7 +738,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -2753,6 +777,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +823,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2897,26 +929,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //--------------------------------------------------------------------------
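The C++ branch above walks SIMD "event pages" with ieventAccessRecord-style helpers before calling the templated G2COUP. A minimal self-contained sketch of that paging pattern (illustrative constants and helper only, not the plugin's real memory accessors):

```cpp
// Toy AOSOA paging sketch: nevt events are stored so that neppV consecutive
// values form one page (one SIMD vector); an ieventAccessRecord-style helper
// returns the base of the page containing a page-aligned event index ievt0.
#include <cstdio>
using fptype = double;
constexpr int neppV = 4; // events per SIMD page (illustrative value)
inline const fptype* ieventAccessRecordConst( const fptype* buffer, int ievt0 ) { return buffer + ievt0; }
int main()
{
  constexpr int nevt = 8;
  fptype gs[nevt] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
  {
    const int ievt0 = ipagV * neppV; // page-aligned event index
    const fptype* page = ieventAccessRecordConst( gs, ievt0 );
    std::printf( "page %d starts at g=%f\n", ipagV, page[0] ); // one SIMD vector per page
  }
  return 0;
}
```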
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +956,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif
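The atomicAdd in update_jamp2s above is what keeps the one-helicity-per-stream design safe: several streams may accumulate into the same colAllJamp2s slot concurrently, and a plain "+=" would race. A self-contained toy kernel (plain CUDA, illustrative names only) demonstrating the pattern:

```cuda
// Toy CUDA sketch: two kernels launched on different streams accumulate into
// the same buffer; atomicAdd makes the concurrent read-modify-write safe.
#include <cstdio>
__global__ void accumulate( float* sums, const float* contrib, int n )
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) atomicAdd( &sums[i % 4], contrib[i] ); // 4 "colors", many events
}
int main()
{
  const int n = 1024;
  float *sums, *contrib;
  cudaMallocManaged( &sums, 4 * sizeof( float ) );
  cudaMallocManaged( &contrib, n * sizeof( float ) );
  for( int i = 0; i < n; i++ ) contrib[i] = 1.f;
  for( int i = 0; i < 4; i++ ) sums[i] = 0.f;
  cudaStream_t s0, s1;
  cudaStreamCreate( &s0 );
  cudaStreamCreate( &s1 );
  accumulate<<<4, 256, 0, s0>>>( sums, contrib, n ); // "helicity 0"
  accumulate<<<4, 256, 0, s1>>>( sums, contrib, n ); // "helicity 1", possibly concurrent
  cudaDeviceSynchronize();
  printf( "sums[0]=%f (expect 512)\n", sums[0] ); // 2 kernels * 1024/4 adds of 1.f
  cudaStreamDestroy( s0 );
  cudaStreamDestroy( s1 );
  cudaFree( sums );
  cudaFree( contrib );
  return 0;
}
```

Replacing the atomicAdd by `sums[i % 4] += contrib[i]` would give nondeterministic undercounts once the two streams overlap, which is exactly the hazard the comment in update_jamp2s warns about.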
+ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -3080,20 +1316,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3141,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (deferred until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -3329,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 2b75e0f842..f142e7ef7d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 159; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index c087f3f747..a2dfd42919 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ce5493be9b..2d8197d859 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
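(For orientation before the new file's contents: color_sum.cc reduces the ncolor QCD partial amplitudes "jamps" of one helicity to a |M|^2 contribution per event, via the quadratic form |M|^2 += sum_ij jamp_i^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j. A minimal self-contained C++ sketch of that reduction follows; it is illustrative only, with a made-up 2x2 matrix, denominators and jamp values rather than the 24x24 color matrix of this process.)

#include <complex>
#include <cstdio>

// Sketch: color sum for ONE event and ONE helicity (made-up ncolor=2 inputs)
int main()
{
  constexpr int ncolor = 2;                                // made up (the real process uses 24)
  const double colorDenom[ncolor] = { 3, 3 };              // made-up denominators
  const double colorMatrix[ncolor][ncolor] = { { 16, -2 },
                                               { -2, 16 } }; // made-up real symmetric matrix
  const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.05 } }; // made-up jamps
  double deltaME = 0; // |M|^2 contribution of this helicity for this event
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ )
      ztemp += colorMatrix[i][j] / colorDenom[i] * jamp[j];
    deltaME += ( std::conj( jamp[i] ) * ztemp ).real(); // imaginary parts cancel for a real symmetric matrix
  }
  std::printf( "deltaME = %f\n", deltaME );
  return 0;
}

(The BLAS path further below performs the same two steps in bulk: one gemm for ztemp over all events, then batched dot products for the per-event contraction.)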
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
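+    // Note: with M real and symmetric and jamp = A + iB, the cross terms of (A-iB)M(A+iB) cancel, so only
+    // the real arrays enter below; e.g. for ncolor=2, the sum over icol/jcol reduces to
+    // deltaMEs = cf2[0][0]*(A0^2+B0^2) + cf2[0][1]*(A0*A1+B0*B1) + cf2[1][1]*(A1^2+B1^2),
+    // where the factor 2 on off-diagonal terms and the 1/colorDenom normalization are already folded into cf2.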
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (the cuBLAS/hipBLAS strided-batched gemm) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..08f07c1187 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h @@ -0,0 +1,4120 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
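(For orientation before the new diagrams.h: every diagramN kernel below follows the same pattern as diagram1 and diagram2. It reuses the wavefunctions w_fp filled by earlier diagrams, computes one amplitude into amp_fp/amp_sv, adds it to the color-ordered jamps, and, in multichannel mode, adds |amp|^2 to the per-event numerator (only if channelId selects this diagram) and to the denominator (for any nonzero channelId). A minimal sketch of that single-diagram-enhancement bookkeeping follows; this is hypothetical standalone code with made-up values, not the plugin's API.)

#include <cstdio>

// Sketch: per-event single-diagram enhancement sums (scalar, made-up ndiagrams=3)
int main()
{
  const unsigned int channelId = 2;         // 1-based selected channel; 0 disables the enhancement
  const double amp2[3] = { 0.4, 1.5, 0.1 }; // made-up |amp|^2 of each diagram for one event/helicity
  double numerator = 0, denominator = 0;
  for( unsigned int idiag = 1; idiag <= 3; idiag++ )
  {
    if( channelId == idiag ) numerator += amp2[idiag - 1]; // only the diagram matching the channel
    if( channelId != 0 ) denominator += amp2[idiag - 1];   // all diagrams
  }
  // The final |M|^2 is later reweighted by numerator/denominator (see normalise_output in the CPPProcess.cc hunk above)
  std::printf( "weight = %f\n", numerator / denominator );
  return 0;
}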
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 123 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 1 + VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 ***
+ // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram16( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 16 OF 123 ***
+    // Wavefunction(s) for diagram number 16
+    // (none)
+    // Amplitude(s) for diagram number 16
+    FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram17( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 17 OF 123 ***
+    // Wavefunction(s) for diagram number 17
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 17
+    FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram18( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 18 OF 123 ***
+    // Wavefunction(s) for diagram number 18
+    FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 18
+    FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram19( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 19 OF 123 ***
+    // Wavefunction(s) for diagram number 19
+    // (none)
+    // Amplitude(s) for diagram number 19
+    FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram20( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 123 ***
+    // Wavefunction(s) for diagram number 20
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 123 ***
+    // Wavefunction(s) for diagram number 21
+    // (none)
+    // Amplitude(s) for diagram number 21
+    FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 123 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 123 ***
+    // Wavefunction(s) for diagram number 23
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 123 ***
+    // Wavefunction(s) for diagram number 24
+    // (none)
+    // Amplitude(s) for diagram number 24
+    FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 123 ***
+    // Wavefunction(s) for diagram number 25
+    // (none)
+    // Amplitude(s) for diagram number 25
+    FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram26( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 26 OF 123 ***
+    // Wavefunction(s) for diagram number 26
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
+    // Amplitude(s) for diagram number 26
+    FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 123 ***
+    // Wavefunction(s) for diagram number 27
+    // (none)
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 123 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 123 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 123 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 123 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 123 ***
+    // Wavefunction(s) for diagram number 32
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 123 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 123 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
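// NB: the two-line comment repeated in each kernel above refers to the shared setup in diagram_boilerplate.h.
// The sketch below is a hypothetical reading of what such a header could expand to, inferred only from the
// names visible in this diff (channelId, amp_sv, amp_fp, numerators_sv, denominators_sv); the accessor
// spellings NUM_ACCESS/DEN_ACCESS and the scalar channelId derivation are assumptions, not the plugin's code.
//
// #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   const unsigned int channelId = channelIds[0];                        // simplified: scalar channel (real code may index per event)
//   fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );   // assumed accessor
//   fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); // assumed accessor
// #else
//   assert( channelIds == nullptr );   // uniform interface without multichannel:
//   assert( numerators == nullptr );   // the three pointers must all be nullptr
//   assert( denominators == nullptr ); // (requires <cassert>)
// #endif
//   cxtype_sv amp_sv[1] = {};                             // one amplitude at a time, reset per diagram
//   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to HELAS calls as &amp_fp[0]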
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 123 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+
J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 123 *** + // Wavefunction(s) for diagram number 49 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); + FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
DIAGRAM 50 OF 123 *** + // Wavefunction(s) for diagram number 50 + VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 50 + FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 51 OF 123 *** + // Wavefunction(s) for diagram number 51 + // (none) + // Amplitude(s) for diagram number 51 + FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 52 OF 123 *** + // Wavefunction(s) for diagram number 52 + FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 52 + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 53 OF 123 *** + // Wavefunction(s) for diagram number 53 + // (none) + // Amplitude(s) for diagram number 53 + FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 123 *** + // Wavefunction(s) for diagram number 54 + // (none) + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 123 *** + // Wavefunction(s) for diagram number 55 + // (none) + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 123 *** + // Wavefunction(s) for diagram number 56 + // (none) + // Amplitude(s) for diagram number 56 + FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, 
// output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 123 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 58 OF 123 *** + // Wavefunction(s) for diagram number 58 + // (none) + // Amplitude(s) for diagram number 58 + VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 123 *** + // Wavefunction(s) for diagram number 59 + VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 59 + VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 123 *** + // Wavefunction(s) for diagram number 60 + // (none) + // Amplitude(s) for diagram number 60 + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 61 OF 123 *** + // Wavefunction(s) for diagram number 61 + // (none) + // Amplitude(s) for diagram number 61 + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 62 OF 123 *** + // Wavefunction(s) for diagram number 62 + // (none) + // Amplitude(s) for diagram number 62 + FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 123 *** + // Wavefunction(s) for diagram number 63 + // (none) + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 123 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 123 *** + // Wavefunction(s) for diagram number 65 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 123 *** + // Wavefunction(s) for diagram number 66 + VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] ); + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 123 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 123 *** + // Wavefunction(s) for diagram number 68 + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 68 + FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 123 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 123 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 123 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 123 *** + // Wavefunction(s) for diagram number 72 
+ // (none) + // Amplitude(s) for diagram number 72 + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 73 OF 123 *** + // Wavefunction(s) for diagram number 73 + // (none) + // Amplitude(s) for diagram number 73 + VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 74 OF 123 *** + // Wavefunction(s) for diagram number 74 + // (none) + // Amplitude(s) for diagram number 74 + VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 75 OF 123 *** + // Wavefunction(s) for diagram number 75 + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 75 + VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 75 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 76 OF 123 *** + // Wavefunction(s) for diagram number 76 + // (none) + // Amplitude(s) for diagram number 76 + VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
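Each kernel adds |amp|^2 into numerators_sv only when this event's channelId selects that diagram, and into denominators_sv for every diagram whenever multichannel is enabled. How the two sums are combined happens downstream, outside this diff; a hedged sketch of the usual single-diagram-enhancement (SDE) rescaling, with a hypothetical helper name, is:

  // Hedged sketch, not repo code: combine the accumulated multichannel sums
  // into an SDE weight for one event (sdeRescale is a hypothetical name)
  inline double sdeRescale( double me2, double numerator, double denominator )
  {
    // Illustrative guard for the case where no diagram contributed
    return denominator != 0. ? me2 * numerator / denominator : me2;
  }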
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 123 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram80( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 123 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 123 ***
+    // Wavefunction(s) for diagram number 81
+    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 123 ***
+    // Wavefunction(s) for diagram number 82
+    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 123 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 123 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 123 ***
+    // Wavefunction(s) for diagram number 85
+    // (none)
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 123 ***
+    // Wavefunction(s) for diagram number 86
+    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 123 ***
+    // Wavefunction(s) for diagram number 87
+    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
"diagram_boilerplate.h" + // *** DIAGRAM 88 OF 123 *** + // Wavefunction(s) for diagram number 88 + FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 88 + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 89 OF 123 *** + // Wavefunction(s) for diagram number 89 + // (none) + // Amplitude(s) for diagram number 89 + FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 90 OF 123 *** + // Wavefunction(s) for diagram number 90 + FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + // Amplitude(s) for diagram number 90 + FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 123 ***
+    // Wavefunction(s) for diagram number 90
+    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram91( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 91 OF 123 ***
+    // Wavefunction(s) for diagram number 91
+    // (none)
+    // Amplitude(s) for diagram number 91
+    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram92( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 92 OF 123 ***
+    // Wavefunction(s) for diagram number 92
+    // (none)
+    // Amplitude(s) for diagram number 92
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
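The jamp updates above go through J_ACCESS::kernelAccessIcol( jamps, icol ), which returns a (possibly SIMD) complex reference for color flow icol inside the buffer declared as jamps[ncolor*2*nevtORneppV]. The real J_ACCESS implementation is not in this diff; a hedged sketch of the index arithmetic that layout implies, with a hypothetical helper name, is:

  // Hypothetical accessor, consistent with jamps[ncolor*2*nevtORneppV] but not
  // necessarily identical to the repo's J_ACCESS: [icol][reim][ievt] ordering
  inline double& jampAccess( double* jamps, int icol, int reim, int ievt, int nevt )
  {
    return jamps[( icol * 2 + reim ) * nevt + ievt];
  }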
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram93( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 93 OF 123 ***
+    // Wavefunction(s) for diagram number 93
+    // (none)
+    // Amplitude(s) for diagram number 93
+    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
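Diagram 93 (like diagrams 100, 107 and 114 below) contains a four-gluon vertex: the three VVVV*_0 calls evaluate its three independent color structures, each overwriting amp_sv[0] and scattering it into a different subset of color flows with +/- i weights, and no multichannel numerator/denominator lines are emitted for these multi-amplitude diagrams. A self-contained scalar analogue of the scatter step (std::complex standing in for cxtype; the helper name and index lists are purely illustrative):

  #include <complex>
  #include <initializer_list>
  #include <vector>
  using cxd = std::complex<double>;
  // Illustration only: add +i*amp to the "plus" flows and -i*amp to the "minus"
  // flows, as each block of kernelAccessIcol updates above does for one amplitude
  void scatterAmp( std::vector<cxd>& jamp, cxd amp, std::initializer_list<int> plus, std::initializer_list<int> minus )
  {
    const cxd I( 0., 1. );
    for( int i : plus ) jamp[i] += I * amp;
    for( int i : minus ) jamp[i] -= I * amp;
  }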
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram94( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 123 ***
+    // Wavefunction(s) for diagram number 94
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 94
+    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 123 ***
+    // Wavefunction(s) for diagram number 95
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 95
+    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 123 ***
+    // Wavefunction(s) for diagram number 96
+    // (none)
+    // Amplitude(s) for diagram number 96
+    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 123 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
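Every kernel signature above switches its coupling argument by build: the GPU build receives one flat array of per-event dependent couplings, while the vectorized C++ build receives per-coupling pointers already positioned on the current event page. A hedged sketch of the GPU-side indexing that the layout comment couplings[nevt*ndcoup*2] implies (the helper name is hypothetical and fptype stands in for the repo's floating-point typedef):

  using fptype = double; // stand-in for the repo's fptype typedef
  // Hypothetical helper, not repo code: locate one event's dependent couplings
  // inside the flat array couplings[nevt*ndcoup*2] (2 fptypes = re, im per coupling)
  inline const fptype* eventCouplings( const fptype* couplings, int ievt, int ndcoup )
  {
    return couplings + ievt * ndcoup * 2;
  }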
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 123 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 123 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram100( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 123 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 123 ***
+    // Wavefunction(s) for diagram number 101
+    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 101
+    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 123 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram103( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 103 OF 123 ***
+    // Wavefunction(s) for diagram number 103
+    // (none)
+    // Amplitude(s) for diagram number 103
+    FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram104( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 123 ***
+    // Wavefunction(s) for diagram number 104
+    // (none)
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram105( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 123 ***
+    // Wavefunction(s) for diagram number 105
+    // (none)
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 123 ***
+    // Wavefunction(s) for diagram number 106
+    // (none)
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram107( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 123 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
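The multichannel sums in the single-amplitude kernels use cxabs2, i.e. the squared modulus |amp|^2 without a square root. Its exact definition lives elsewhere in the plugin (mgOnGpu's complex-type headers); a self-contained scalar equivalent, with std::complex standing in for cxtype and a hypothetical name to avoid clashing with the real one, is:

  #include <complex>
  // Scalar stand-in for cxabs2: |z|^2 = re^2 + im^2, no sqrt needed
  inline double cxabs2_sketch( const std::complex<double>& z )
  {
    return z.real() * z.real() + z.imag() * z.imag();
  }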
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 123 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram109( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 123 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 110 OF 123 *** + // Wavefunction(s) for diagram number 110 + // (none) + // Amplitude(s) for diagram number 110 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + 
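Each generated diagramNNN kernel above follows the same pattern: compute one or more amplitudes from previously filled wavefunctions, optionally accumulate the multichannel numerator and denominator, and scatter the amplitude into a fixed subset of the ncolor color amplitudes, with signs (and a factor i for the triple and quadruple gluon vertices) fixed by the color-flow decomposition. The following is a minimal standalone sketch of that pattern, using std::complex in place of the plugin's vectorized cxtype_sv and J_ACCESS types; all names here are illustrative placeholders, not the plugin's API.

// Sketch only: scatter one diagram's amplitude into the color amplitudes
// ("jamps") and, when multichannel single-diagram enhancement (SDE) is
// active, accumulate |amp|^2 into numerator (own channel) and denominator.
#include <complex>
using cxtype = std::complex<double>;

void accumulateDiagram( cxtype* jamps,                 // [ncolor] color amplitudes, one event
                        double& numerator,             // SDE numerator for this event
                        double& denominator,           // SDE denominator for this event
                        const unsigned int channelId,  // 1..ndiagrams, 0 disables SDE
                        const unsigned int thisDiagram,
                        const cxtype amp,              // amplitude of this diagram
                        const int* icols,              // indices of the affected jamps
                        const int* signs,              // +1/-1 per affected jamp
                        const int n,                   // number of affected jamps
                        const bool timesI )            // true for VVV/VVVV vertices (factor i)
{
  if( channelId == thisDiagram ) numerator += std::norm( amp ); // std::norm = |amp|^2, cxabs2 analogue
  if( channelId != 0 ) denominator += std::norm( amp );
  const cxtype term = timesI ? cxtype( 0, 1 ) * amp : amp;
  for( int k = 0; k < n; k++ )
    jamps[icols[k]] += double( signs[k] ) * term;
}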
//-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 116 OF 123 *** + // Wavefunction(s) for diagram number 116 + // (none) + // Amplitude(s) for diagram number 116 + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 117 OF 123 *** + // Wavefunction(s) for diagram number 117 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 117 + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + 
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 118 OF 123 *** + // Wavefunction(s) for diagram number 118 + // (none) + // Amplitude(s) for diagram number 118 + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 119 OF 123 *** + // Wavefunction(s) for diagram number 119 + // (none) + // Amplitude(s) for diagram number 119 + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 120 OF 123 *** + // Wavefunction(s) for diagram number 120 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 120 + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 121 OF 123 *** + // Wavefunction(s) for diagram number 121 + // (none) + // Amplitude(s) for diagram number 121 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 122 OF 123 *** + // Wavefunction(s) for diagram number 122 + // (none) + // Amplitude(s) for diagram number 122 + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 123 OF 123 *** + // Wavefunction(s) for diagram number 123 + // (none) + // Amplitude(s) for diagram number 123 + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
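All of the diagramNNN kernels above include "diagram_boilerplate.h", whose contents this diff does not show. Based only on the comment repeated in every kernel, a plausible sketch of the sanity check it expands to is the following; this is an assumption made to illustrate the comment, not the header's actual contents.

// Hypothetical sketch of part of diagram_boilerplate.h (not shown in this
// diff): when multichannel SDE support is compiled out, the uniform kernel
// signature keeps the channelIds/numerators/denominators arguments, and the
// boilerplate merely verifies that callers pass nullptr for all three.
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // SDE disabled at build time
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif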
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index 3ea53d8b21..7b6fa85360 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +442,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, 
- $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ 
-1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA (CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ 
-1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA 
(CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) 
/512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1222,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1236,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
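The new color_sum.h below centralizes on the C++/GPU side the same color algebra that the matrix1.f hunk above restructures on the Fortran side. There, the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix is replaced by its integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a common denominator DENOM=54: the matrix is symmetric, the off-diagonal entries are stored pre-doubled (for example the packed -128 corresponds to the old -1.185185... = -64/54), so a single triangular pass with a real part recovers the full quadratic form, and the division by DENOM happens once at the end. A minimal C++ sketch of the same traversal follows; it is illustrative only, mirroring the new Fortran CF_INDEX loop, and is not the generated code.

// Packed triangular color sum: cf holds the upper triangle of the symmetric
// color matrix row by row (diagonal entry first), with off-diagonal entries
// already doubled; denom is the common integer denominator factored out.
#include <complex>
#include <vector>

double colorSumPacked( const std::vector<int>& cf,                      // ncolor*(ncolor+1)/2 packed entries
                       const std::vector<std::complex<double>>& jamp,   // ncolor color amplitudes
                       const int ncolor,
                       const int denom )
{
  double me2 = 0;
  int idx = 0; // walks the packed triangle exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp( 0, 0 );
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
    me2 += std::real( ztemp * std::conj( jamp[i] ) ); // real part, as MATRIX1 (a REAL*8) keeps
  }
  return me2 / denom; // MATRIX1 = MATRIX1/DENOM
}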
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
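The "old"/"new1"/"new2" comments in DeviceAccessJamp above describe three layouts of the same (icol, ievt, real/imag) triple in the flat jamps buffer. "new1" keeps the real and imaginary parts as two contiguous ncolor-by-nevt matrices with the event index fastest, which is what lets the color sum be handed to cuBLAS/hipBLAS as plain real matrices. A small host-side sketch of the three index maps, mirroring the device code above (illustrative helper names, not part of the plugin):

// Flat offsets for one (icol, ievt, part) entry, part = 0 (real) or 1 (imag).
inline int offsetOld( int icol, int ievt, int part, int /*ncolor*/, int nevt )
{
  return icol * 2 * nevt + part * nevt + ievt;      // per-color [re|im] blocks, ievt last
}
inline int offsetNew1( int icol, int ievt, int part, int ncolor, int nevt )
{
  return part * ncolor * nevt + icol * nevt + ievt; // re/im planes of ncolor x nevt, ievt last
}
inline int offsetNew2( int icol, int ievt, int part, int ncolor, int nevt )
{
  return part * nevt * ncolor + ievt * ncolor + icol; // re/im planes, icol last
}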
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, 
path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
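For nprocs > 1, the gridpack refine loop above submits every job through cluster.MultiCore and then collects them with a wait() that takes the monitoring callback. A minimal self-contained sketch of that submit/monitor/wait shape, rebuilt on concurrent.futures since cluster.MultiCore is MG5aMC-internal; run_job, jobs and the sleep intervals are illustrative stand-ins, and the monitor signature (Idle, Running, Done) mirrors gridpack_wait_monitoring in the diff:

    import time
    from concurrent.futures import ThreadPoolExecutor

    def run_job(exe, cwd):
        # stand-in for launching one refine script (the real code execs 'ajob*' in cwd)
        time.sleep(0.1)
        return exe

    def wait_monitoring(idle, running, done):
        if idle + running + done == 0:
            return
        print("Gridpack event generation: %s Idle, %s Running, %s Done" % (idle, running, done))

    jobs = [("ajob1", "P1"), ("ajob2", "P2"), ("ajob3", "P3")]  # illustrative payloads
    nprocs = 2
    with ThreadPoolExecutor(max_workers=nprocs) as pool:
        futures = [pool.submit(run_job, exe, cwd) for exe, cwd in jobs]
        while True:
            done = sum(f.done() for f in futures)
            running = min(nprocs, len(futures) - done)  # rough estimate for display
            idle = len(futures) - done - running
            wait_monitoring(idle, running, done)
            if done == len(futures):
                break
            time.sleep(0.2)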
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
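The remove_empty_events helper above scans each suspect channel log backwards (via the MG5aMC-internal misc.BackRead) and buckets the failure into reasons such as 'cuts', 'zero' and 'bwconfig'. A condensed, self-contained sketch of that classifier using a plain reversed() read; the markers are the ones matched in the diff, and the found-counter bookkeeping of the real code is simplified into a fixed search window:

    def classify_empty_channel(log_lines, window=150):
        markers = {
            'Impossible BW configuration': 'bwconfig',
            'Loosen cuts or increase max_events': 'cuts',
            'all returned zero': 'zero',
        }
        for i, line in enumerate(reversed(log_lines)):
            for marker, reason in markers.items():
                if marker in line:
                    return reason
            if i > window:   # deletion notice never found near the end of the log
                return 'not found'
        return 'unknown'

    assert classify_empty_channel(['...', 'Loosen cuts or increase max_events']) == 'cuts'
    assert classify_empty_channel(['nothing useful here']) == 'unknown'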
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/madevent b/epochX/cudacpp/gg_ttgg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttgg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) 
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7e5a3007eb..a55f92c773 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0059185028076171875  +DEBUG: model prefixing takes 0.005396842956542969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.158 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.315 s VVV1 VVV1 FFV1 @@ -190,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.455s -user 0m1.362s +real 0m1.450s +user 0m1.371s sys 0m0.060s -Code generation completed in 1 seconds +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
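
The new checkGpuBlas/assertGpuBlas helpers mirror the existing checkGpu/assertGpu idiom; a minimal usage sketch (the handle and stream names are illustrative, not taken from the patch):

  // Minimal sketch: wrap each cuBLAS/hipBLAS call in checkGpuBlas, just as
  // CUDA/HIP runtime calls are wrapped in checkGpu (asserts on failure).
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // already wrapped in checkGpu by the macro itself
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the stream to the handle
  // ... gpuBlasTgemm( handle, ... ) color-sum calls would go here ...
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
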
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
- const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
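
With the AOSOA machinery removed above, amplitude access collapses to a type pun on the underlying buffer; a minimal usage sketch (the <bool onDevice> template parameter is reconstructed from the surrounding code, and the buffer sizing assumes the scalar non-SIMD case):

  // Minimal sketch: one complex amplitude stored as two consecutive fptypes
  // (in vectorized C++ builds the record would hold 2*neppV fptypes instead).
  fptype buffer[2] = { 0., 0. }; // { re, im }
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( buffer );
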
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
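For reference, the net effect of the MemoryAccessWavefunctions.h rewrite just above is a split into a device accessor (one GPU thread per event, offset ievt*nw6*nx2 into a global buffer) and a trivial host accessor. Below is a minimal CUDA sketch of the device-side pattern only; nw6=6, nx2=2 and the kernel name fillFirstComponent are illustrative assumptions, and error checking is omitted.

#include <cstdio>

using fptype = double;
constexpr int nw6 = 6, nx2 = 2;

__global__ void fillFirstComponent( fptype* wfs )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
  fptype* myWf = wfs + ievt * nw6 * nx2; // this event's wavefunction record
  myWf[0] = ievt; // write the real part of the first component
}

int main()
{
  const int nevt = 8;
  fptype* d_wfs = nullptr;
  cudaMalloc( &d_wfs, nevt * nw6 * nx2 * sizeof( fptype ) );
  fillFirstComponent<<<1, nevt>>>( d_wfs );
  fptype h_wfs[nevt * nw6 * nx2];
  cudaMemcpy( h_wfs, d_wfs, sizeof( h_wfs ), cudaMemcpyDeviceToHost );
  for( int ievt = 0; ievt < nevt; ievt++ )
    printf( "event %d: wf[0]=%f\n", ievt, h_wfs[ievt * nw6 * nx2] );
  cudaFree( d_wfs );
  return 0;
}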
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 5956559974..5e1fba0c34 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2469 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
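The wavefunction-buffer switch below hinges on the fact that a local cxtype_sv w_sv[nwf][nw6] array is bitwise a contiguous fptype array (each cxtype_sv packs nx2=2 fptype values), so the C++ branch can hand it to the diagram routines as a flat fptype*, while the CUDA branch points into the global allWfs buffer instead. A standalone sketch of the cast (simplified scalar types assumed; not the generated code):

#include <cstdio>

using fptype = double;
struct cxtype_sv { fptype r, i; }; // stand-in for the scalar/SIMD complex type

int main()
{
  constexpr int nwf = 3, nw6 = 6, nx2 = 2;
  cxtype_sv w_sv[nwf][nw6] = {};
  w_sv[1][0] = { 7.0, -7.0 };
  fptype* wfs = reinterpret_cast<fptype*>( w_sv ); // as in the C++ branch below
  // component (iwf=1, iw6=0) starts at flat offset iwf*nw6*nx2
  printf( "re=%f im=%f\n", wfs[1 * nw6 * nx2 + 0], wfs[1 * nw6 * nx2 + 1] );
  return 0;
}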
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 123 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 1 - VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 123 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 123 *** - - // Wavefunction(s) for diagram number 3 - VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 123 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv 
and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 123 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 123 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 7 OF 123 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 123 *** - - // Wavefunction(s) for diagram number 8 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 123 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 10 OF 123 *** - - // Wavefunction(s) for diagram number 10 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 123 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // 
Amplitude(s) for diagram number 11 - FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 123 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - - // *** DIAGRAM 13 OF 123 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 14 OF 123 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - - // *** DIAGRAM 15 OF 123 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - - // *** DIAGRAM 16 OF 123 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 17 OF 123 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 18 OF 123 *** - - // Wavefunction(s) for diagram number 18 - FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 19 OF 123 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 20 OF 123 *** - - // Wavefunction(s) for diagram number 20 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 21 OF 123 *** - - // Wavefunction(s) for diagram number 21 - // (none) - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 22 OF 123 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 23 OF 123 *** - - // Wavefunction(s) for diagram number 23 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] ); - - // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 24 OF 123 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 25 OF 123 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 123 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] ); - - // 
Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 27 OF 123 *** - - // Wavefunction(s) for diagram number 27 - // (none) - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 28 OF 123 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 29 OF 123 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 30 OF 123 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 123 *** - - // Wavefunction(s) for diagram number 31 - // (none) - - // Amplitude(s) for diagram number 31 - VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 32 OF 123 *** - - // Wavefunction(s) for diagram number 32 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 
amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 33 OF 123 *** - - // Wavefunction(s) for diagram number 33 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 34 OF 123 *** - // Wavefunction(s) for diagram number 34 - FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - - // *** DIAGRAM 35 OF 123 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 36 OF 123 *** - - // Wavefunction(s) for diagram number 36 - FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] ); - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 37 OF 123 *** - - // Wavefunction(s) for diagram number 37 - // (none) - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 38 OF 123 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 123 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 40 OF 123 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 123 *** - - // Wavefunction(s) for diagram number 41 - // (none) - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 42 OF 123 *** - - // Wavefunction(s) for diagram number 42 - FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 43 OF 123 *** - - // Wavefunction(s) for diagram number 43 - // (none) - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= amp_sv[0]; - - // *** DIAGRAM 44 OF 123 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 45 OF 123 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 46 OF 123 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 47 OF 123 *** - - // Wavefunction(s) for diagram number 47 - // (none) - - // 
Amplitude(s) for diagram number 47 - VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 48 OF 123 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 49 OF 123 *** - - // Wavefunction(s) for diagram number 49 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); - FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 50 OF 123 *** - - // Wavefunction(s) for diagram number 50 - VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 50 - FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 51 OF 123 *** - - // Wavefunction(s) for diagram number 51 - // (none) - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 52 OF 123 *** - - // Wavefunction(s) for diagram number 52 - FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 123 *** - - 
-    // *** DIAGRAM 53 OF 123 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 54 OF 123 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 55 OF 123 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 123 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 123 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 123 ***
-
-    // Wavefunction(s) for diagram number 58
-    // (none)
-
-    // Amplitude(s) for diagram number 58
-    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 123 ***
-
-    // Wavefunction(s) for diagram number 59
-    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 59
-    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 60 OF 123 ***
-
-    // Wavefunction(s) for diagram number 60
-    // (none)
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 123 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 62 OF 123 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
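The VVVV1_0 / VVVV3_0 / VVVV4_0 triples (as in diagram 58 above) evaluate the three color-ordered pieces of the four-gluon vertex separately, which is why each call feeds a different pattern of jamp_sv entries. In standard QCD conventions the vertex splits as

\begin{aligned}
\Gamma^{\mu_1\mu_2\mu_3\mu_4}_{a_1a_2a_3a_4} = -ig_s^2\big[\,
& f^{a_1a_2b}f^{ba_3a_4}\,(g^{\mu_1\mu_3}g^{\mu_2\mu_4}-g^{\mu_1\mu_4}g^{\mu_2\mu_3}) \\
+\, & f^{a_1a_3b}f^{ba_2a_4}\,(g^{\mu_1\mu_2}g^{\mu_3\mu_4}-g^{\mu_1\mu_4}g^{\mu_2\mu_3}) \\
+\, & f^{a_1a_4b}f^{ba_2a_3}\,(g^{\mu_1\mu_2}g^{\mu_3\mu_4}-g^{\mu_1\mu_3}g^{\mu_2\mu_4})\,\big],
\end{aligned}

with one color structure per term; the exact mapping of the three terms onto the ALOHA names VVVV1/VVVV3/VVVV4 is an assumption here, not something fixed by this diff. The VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 calls further below are the corresponding internal-wavefunction variants.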
-
-    // *** DIAGRAM 63 OF 123 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 64 OF 123 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 123 ***
-
-    // Wavefunction(s) for diagram number 65
-    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
-    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 123 ***
-
-    // Wavefunction(s) for diagram number 66
-    VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 67 OF 123 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 68 OF 123 ***
-
-    // Wavefunction(s) for diagram number 68
-    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 123 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 70 OF 123 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 71 OF 123 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 123 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 123 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 123 ***
-
-    // Wavefunction(s) for diagram number 74
-    // (none)
-
-    // Amplitude(s) for diagram number 74
-    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 123 ***
-
-    // Wavefunction(s) for diagram number 75
-    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
-    // Amplitude(s) for diagram number 75
-    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 123 ***
-
-    // Wavefunction(s) for diagram number 76
-    // (none)
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 123 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 78 OF 123 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 123 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 80 OF 123 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 123 ***
-
-    // Wavefunction(s) for diagram number 81
-    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
-    // Amplitude(s) for diagram number 81
-    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-
-    // *** DIAGRAM 82 OF 123 ***
-
-    // Wavefunction(s) for diagram number 82
-    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
-    // Amplitude(s) for diagram number 82
-    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 83 OF 123 ***
-
-    // Wavefunction(s) for diagram number 83
-    // (none)
-
-    // Amplitude(s) for diagram number 83
-    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-
-    // *** DIAGRAM 84 OF 123 ***
-
-    // Wavefunction(s) for diagram number 84
-    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
-    // Amplitude(s) for diagram number 84
-    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= amp_sv[0];
-
-    // *** DIAGRAM 85 OF 123 ***
-
-    // Wavefunction(s) for diagram number 85
-    // (none)
-
-    // Amplitude(s) for diagram number 85
-    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 86 OF 123 ***
-
-    // Wavefunction(s) for diagram number 86
-    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 86
-    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-
-    // *** DIAGRAM 87 OF 123 ***
-
-    // Wavefunction(s) for diagram number 87
-    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
-    // Amplitude(s) for diagram number 87
-    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 88 OF 123 ***
-
-    // Wavefunction(s) for diagram number 88
-    FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
-    // Amplitude(s) for diagram number 88
-    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-
-    // *** DIAGRAM 89 OF 123 ***
-
-    // Wavefunction(s) for diagram number 89
-    // (none)
-
-    // Amplitude(s) for diagram number 89
-    FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[22] -= amp_sv[0];
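The recurring "#473" placeholder stanza marks where code generated with multichannel support updates the single-diagram weight. An editorial sketch of what that update roughly looks like (standard C++ stand-in for the SIMD types; the function name and the scalar signature are hypothetical, only the channelId logic is taken from the comment):

#include <complex>
// Sketch: after each diagram amplitude, accumulate the multichannel weight.
void updateChannelWeights( double& numerator_sv, double& denominator_sv,
                           const std::complex<double>& amp,
                           unsigned int channelId, unsigned int thisDiagram )
{
  const double amp2 = std::norm( amp );                 // |amp|^2 (cf. cxabs2 below)
  if( channelId == thisDiagram ) numerator_sv += amp2;  // only the sampled channel
  if( channelId != 0 ) denominator_sv += amp2;          // every diagram contributes
}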
-
-    // *** DIAGRAM 90 OF 123 ***
-
-    // Wavefunction(s) for diagram number 90
-    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
-    // Amplitude(s) for diagram number 90
-    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 91 OF 123 ***
-
-    // Wavefunction(s) for diagram number 91
-    // (none)
-
-    // Amplitude(s) for diagram number 91
-    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 92 OF 123 ***
-
-    // Wavefunction(s) for diagram number 92
-    // (none)
-
-    // Amplitude(s) for diagram number 92
-    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 93 OF 123 ***
-
-    // Wavefunction(s) for diagram number 93
-    // (none)
-
-    // Amplitude(s) for diagram number 93
-    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 94 OF 123 ***
-
-    // Wavefunction(s) for diagram number 94
-    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
-    // Amplitude(s) for diagram number 94
-    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 95 OF 123 ***
-
-    // Wavefunction(s) for diagram number 95
-    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 95
-    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 96 OF 123 ***
-
-    // Wavefunction(s) for diagram number 96
-    // (none)
-
-    // Amplitude(s) for diagram number 96
-    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-
-    // *** DIAGRAM 97 OF 123 ***
-
-    // Wavefunction(s) for diagram number 97
-    // (none)
-
-    // Amplitude(s) for diagram number 97
-    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 98 OF 123 ***
-
-    // Wavefunction(s) for diagram number 98
-    // (none)
-
-    // Amplitude(s) for diagram number 98
-    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-
-    // *** DIAGRAM 99 OF 123 ***
-
-    // Wavefunction(s) for diagram number 99
-    // (none)
-
-    // Amplitude(s) for diagram number 99
-    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 100 OF 123 ***
-
-    // Wavefunction(s) for diagram number 100
-    // (none)
-
-    // Amplitude(s) for diagram number 100
-    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 101 OF 123 ***
-
-    // Wavefunction(s) for diagram number 101
-    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
-    // Amplitude(s) for diagram number 101
-    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 102 OF 123 ***
-
-    // Wavefunction(s) for diagram number 102
-    // (none)
-
-    // Amplitude(s) for diagram number 102
-    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 103 OF 123 ***
-
-    // Wavefunction(s) for diagram number 103
-    // (none)
-
-    // Amplitude(s) for diagram number 103
-    FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-
-    // *** DIAGRAM 104 OF 123 ***
-
-    // Wavefunction(s) for diagram number 104
-    // (none)
-
-    // Amplitude(s) for diagram number 104
-    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 105 OF 123 ***
-
-    // Wavefunction(s) for diagram number 105
-    // (none)
-
-    // Amplitude(s) for diagram number 105
-    FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-
-    // *** DIAGRAM 106 OF 123 ***
-
-    // Wavefunction(s) for diagram number 106
-    // (none)
-
-    // Amplitude(s) for diagram number 106
-    FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 107 OF 123 ***
-
-    // Wavefunction(s) for diagram number 107
-    // (none)
-
-    // Amplitude(s) for diagram number 107
-    VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 108 OF 123 ***
-
-    // Wavefunction(s) for diagram number 108
-    // (none)
-
-    // Amplitude(s) for diagram number 108
-    VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 109 OF 123 ***
-
-    // Wavefunction(s) for diagram number 109
-    // (none)
-
-    // Amplitude(s) for diagram number 109
-    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 110 OF 123 ***
-
-    // Wavefunction(s) for diagram number 110
-    // (none)
-
-    // Amplitude(s) for diagram number 110
-    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 111 OF 123 ***
-
-    // Wavefunction(s) for diagram number 111
-    // (none)
-
-    // Amplitude(s) for diagram number 111
-    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 112 OF 123 ***
-
-    // Wavefunction(s) for diagram number 112
-    // (none)
-
-    // Amplitude(s) for diagram number 112
-    FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 113 OF 123 ***
-
-    // Wavefunction(s) for diagram number 113
-    // (none)
-
-    // Amplitude(s) for diagram number 113
-    FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 114 OF 123 ***
-
-    // Wavefunction(s) for diagram number 114
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 114
-    VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 115 OF 123 ***
-
-    // Wavefunction(s) for diagram number 115
-    // (none)
-
-    // Amplitude(s) for diagram number 115
-    FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-
-    // *** DIAGRAM 116 OF 123 ***
-
-    // Wavefunction(s) for diagram number 116
-    // (none)
-
-    // Amplitude(s) for diagram number 116
-    FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 117 OF 123 ***
-
-    // Wavefunction(s) for diagram number 117
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
-    // Amplitude(s) for diagram number 117
-    VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 118 OF 123 ***
-
-    // Wavefunction(s) for diagram number 118
-    // (none)
-
-    // Amplitude(s) for diagram number 118
-    FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[17] -= amp_sv[0];
-
-    // *** DIAGRAM 119 OF 123 ***
-
-    // Wavefunction(s) for diagram number 119
-    // (none)
-
-    // Amplitude(s) for diagram number 119
-    FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 120 OF 123 ***
-
-    // Wavefunction(s) for diagram number 120
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-
-    // Amplitude(s) for diagram number 120
-    FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-
-    // *** DIAGRAM 121 OF 123 ***
-
-    // Wavefunction(s) for diagram number 121
-    // (none)
-
-    // Amplitude(s) for diagram number 121
-    FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 122 OF 123 ***
-
-    // Wavefunction(s) for diagram number 122
-    // (none)
-
-    // Amplitude(s) for diagram number 122
-    VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 123 OF 123 ***
-
-    // Wavefunction(s) for diagram number 123
-    // (none)
-
-    // Amplitude(s) for diagram number 123
-    VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** COLOR CHOICE BELOW ***
-    // Store the leading color flows for choice of color
-    if( jamp2_sv ) // disable color choice if nullptr
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
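The color-choice block accumulates the squared modulus of each color flow into jamp2_sv, which is later used to sample one leading-color flow per event. A sketch of the cxabs2 helper it relies on (assumed to mirror the definition in mgOnGpuCxtypes.h; re*re + im*im avoids the square root hidden in std::abs):

#include <complex>
// Sketch: squared modulus of a complex amplitude, without a sqrt.
inline double cxabs2( const std::complex<double>& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}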
-    // *** COLOR MATRIX BELOW ***
-    // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?)
-
-    // The color denominators (initialize all array elements, with ncolor=24)
-    // [NB do keep 'static' for these constexpr arrays, see issue #283]
-    static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24]
-
-    // The color matrix (initialize all array elements, with ncolor=24)
-    // [NB do keep 'static' for these constexpr arrays, see issue #283]
-    static constexpr fptype2 cf[ncolor][ncolor] = {
-      { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 },
-      { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 },
-      { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 },
-      { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 },
-      { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 },
-      { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 },
-      { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 },
-      { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 },
-      { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 },
-      { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 },
-      { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 },
-      { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 },
-      { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 },
-      { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 },
-      { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 },
-      { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 },
-      { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 },
-      { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 },
-      { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 },
-      { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 },
-      { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 },
-      { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 },
-      { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 },
-      { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24]
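The denom and cf arrays implement the color sum that turns the 24 color flows into a matrix element. Writing the flows as $J_j = A_j + i B_j$, the removed code computes

\[
|M|^2 \;=\; \sum_{i=0}^{23} \frac{1}{d_i}\, J_i^{*} \sum_{j=0}^{23} c_{ij}\, J_j ,
\]

and, because the matrix $c$ is real and symmetric, the quadratic form reduces to two real ones (this is what the "(A-iB)(M)(A+iB)" comment further below refers to; the cross terms $-iB^{T}cA + iA^{T}cB$ cancel by symmetry):

\[
(A - iB)^{T} c\, (A + iB) \;=\; A^{T} c\, A \;+\; B^{T} c\, B .
\]

The constexpr "triangular normalized" matrix below additionally folds the factor 2 for off-diagonal terms and the division by $d_i$ into the matrix entries at compile time, so the C++ loop only visits the upper triangle.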
-#ifndef MGONGPUCPP_GPUIMPL
-    // Pre-compute a constexpr triangular color matrix properly normalized #475
-    struct TriangularNormalizedColorMatrix
-    {
-      // See https://stackoverflow.com/a/34465458
-      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-        : value()
-      {
-        for( int icol = 0; icol < ncolor; icol++ )
-        {
-          // Diagonal terms
-          value[icol][icol] = cf[icol][icol] / denom[icol];
-          // Off-diagonal terms
-          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-            value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-        }
-      }
-      fptype2 value[ncolor][ncolor];
-    };
-    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-    {
-      // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp_sv_previous[icol] = jamp_sv[icol];
-      MEs_previous = MEs;
-      continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-    }
-    fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-    // Sum and square the color flows to get the matrix element
-    // (compute |M|^2 by squaring |M|, taking into account colours)
-    fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-    // Use the property that M is a real matrix (see #475):
-    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-    // In addition, on C++ use the property that M is symmetric (see #475),
-    // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-    // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    fptype2_sv jampR_sv[ncolor] = { 0 };
-    fptype2_sv jampI_sv[ncolor] = { 0 };
-    for( int icol = 0; icol < ncolor; icol++ )
-    {
-      jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-      jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-    }
-#endif
-    for( int icol = 0; icol < ncolor; icol++ )
-    {
-      //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-      // === C++ START ===
-      // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv& jampRi_sv = jampR_sv[icol];
-      fptype2_sv& jampIi_sv = jampI_sv[icol];
+    // ---------------
+    // --- MOMENTA ---
+    // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+    // CUDA diagram kernels take input/output buffers with momenta for all events
+    const fptype* momenta = allmomenta;
 #else
-      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+    // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+    const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
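The new M_ACCESS::ieventAccessRecordConst call above hides the AOSOA (array-of-structs-of-arrays) indexing of the cudacpp momenta buffer. An editorial sketch of the arithmetic it presumably performs, assuming the usual cudacpp layout momenta[ipagM][ipar][ip4][ieppM] with neppM events per SIMD-friendly "page" (the constants below are illustrative, not taken from this diff):

// Sketch: return the address of the record holding event ievt in an AOSOA buffer.
const double* ieventAccessRecordConst( const double* buffer, const int ievt )
{
  constexpr int np4 = 4;   // E, px, py, pz
  constexpr int npar = 6;  // gg -> ttxgg has 6 external particles (assumption)
  constexpr int neppM = 8; // events per page (assumption)
  const int ipagM = ievt / neppM; // page index
  const int ieppM = ievt % neppM; // event index within the page
  return buffer + ipagM * npar * np4 * neppM + ieppM; // [ipagM][0][0][ieppM]
}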
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif
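For review purposes, a minimal sketch of the uniform per-diagram interface that the launches below rely on. This is not the generated code: the body is a placeholder and the name diagramN is hypothetical (the real diagram1..diagram123 bodies are emitted from the Feynman rules); only the signature shape and the nullptr convention are taken from the diff above.

    #include <cassert>
    typedef double fptype; // assumption: the plugin's double-precision build

    // Hypothetical stand-in for one generated diagram kernel: every diagramN shares this
    // boilerplate signature, so the driver can launch all diagrams uniformly.
    void diagramN( fptype* wfs,                    // in/out: wavefunctions for this event (or SIMD vector)
                   fptype* jamps,                  // in/out: running color-ordered amplitudes
                   const unsigned int* channelIds, // input: SDE channels (nullptr when multichannel is off)
                   const fptype* couplings,        // input: couplings
                   fptype* numerators,             // in/out: multichannel numerators (nullptr when off)
                   fptype* denominators )          // in/out: multichannel denominators (nullptr when off)
    {
      // Without multichannel support, the boilerplate passes nullptr for all three and asserts it
      if( channelIds == nullptr ) assert( numerators == nullptr && denominators == nullptr );
      jamps[0] += couplings[0] * wfs[0]; // placeholder: a real diagram updates jamps from computed amplitudes
    }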
+ + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( 
wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -2775,7 +738,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -2810,6 +777,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2852,6 +823,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2954,26 +929,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings,
bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2981,25 +956,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif
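A self-contained illustration of why the accumulation in update_jamp2s must use atomicAdd: with one stream per good helicity, several launches of this kernel may run concurrently and add into the same colAllJamp2s slot. The flat buffer layout and names below are simplifying assumptions, not the plugin's memory accessors.

    // Sketch: concurrent kernels (one per helicity stream) summing |jamp|^2 into a shared buffer.
    // NB: atomicAdd on double requires compute capability 6.0 or later.
    __global__ void accumulateJamp2( const double* jampRe, const double* jampIm, // [ncolor*nevt] for one helicity
                                     double* jamp2,                              // [ncolor*nevt] running sum over helicities
                                     int ncolor, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        const double re = jampRe[icol * nevt + ievt];
        const double im = jampIm[icol * nevt + ievt];
        atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // a plain '+=' would race across streams
      }
    }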
+ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif
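The selection in select_col (and, analogously, the helicity choice in add_and_select_hel) is a standard inverse-CDF draw: build the running sum of the allowed per-color weights, then return the first bin whose normalized cumulative value exceeds the random number. A host-side sketch of the same logic, with hypothetical inputs:

    // Inverse-CDF draw over n bins; returns a 1-based index (Fortran convention, as above).
    // 'cumulative' is the running sum of weights, e.g. the targetamp array built above.
    int selectIndex( const double* cumulative, int n, double rnd ) // rnd uniform in [0,1)
    {
      for( int i = 0; i < n; i++ )
        if( rnd < cumulative[i] / cumulative[n - 1] ) return i + 1;
      return n; // guard against floating-point rounding on the last bin
    }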
+ + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -3144,13 +1323,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3162,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3198,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId
for the current event (CUDA) or for the whole SIMD event page (C++) - The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); + } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) ***
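Taken together, PART 1a is now a fan-out/synchronize/reduce pipeline: per-helicity kernels on dedicated streams, one device-wide synchronization, then the selection kernels. A condensed sketch of that orchestration with plain CUDA runtime calls (the kernel bodies are trivial stand-ins, not the plugin's kernels, and the buffer layout is a simplifying assumption):

    #include <cuda_runtime.h>
    __global__ void helicityWork( float* buf ) { buf[blockIdx.x * blockDim.x + threadIdx.x] += 1.f; } // stand-in for calculate_jamps + color_sum_gpu
    __global__ void selection( const float* buf ) { (void)buf; }                                       // stand-in for add_and_select_hel / select_col

    void part1aPipeline( float* d_buf, cudaStream_t* streams, int nGoodHel, int nblocks, int nthreads )
    {
      const int nevt = nblocks * nthreads;
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // (1)+(2): fan out one stream per good helicity
        helicityWork<<<nblocks, nthreads, 0, streams[ighel]>>>( d_buf + ighel * nevt );
      cudaDeviceSynchronize();                        // (3): wait for all helicity streams
      selection<<<nblocks, nthreads>>>( d_buf );      // then select helicity/color on the default stream
    }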
@@ -3326,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3349,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3358,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) { @@ -3386,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3403,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3509,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV )
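The finalisation that used to run inline per event now happens in a normalise_output kernel, whose body is not shown in this diff. The sketch below reconstructs a plausible multichannel variant from the removed inline code and the launch arguments above; in particular, summing the per-helicity numerator/denominator super-buffers is an assumption, not the confirmed implementation.

    // Hypothetical reconstruction of normalise_output (multichannel variant).
    __global__ void normalise_output_sketch( double* allMEs,
                                             const double* ghelAllNumerators,   // [nGoodHel*nevt] per-helicity numerators
                                             const double* ghelAllDenominators, // [nGoodHel*nevt] per-helicity denominators
                                             const unsigned int* allChannelIds, // nullptr disables SDE reweighting (#892)
                                             double helcolDenominator, int nGoodHel, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      allMEs[ievt] /= helcolDenominator; // average over helicities/colors as in the removed inline code
      if( allChannelIds != nullptr )
      {
        double num = 0, den = 0;
        for( int ighel = 0; ighel < nGoodHel; ighel++ ) // assumption: combine the running sums over helicities
        {
          num += ghelAllNumerators[ighel * nevt + ievt];
          den += ghelAllDenominators[ighel * nevt + ievt];
        }
        allMEs[ievt] *= num / den;
      }
    }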
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 159; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
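+
+// Overview: for one helicity, the color sum computes
+//   |M|^2 = sum_{icol,jcol} jamp[icol]^* * ( colorMatrix[icol][jcol] / colorDenom[icol] ) * jamp[jcol]
+// over the ncolor color-flow amplitudes jamp (colorMatrix and colorDenom are defined below).
+// A minimal scalar sketch of the same computation, using a hypothetical helper colorSumRef
+// and plain std::complex<double> jamps (illustration only, not part of the generated code):
+//   double colorSumRef( const std::complex<double>* jamp ) // jamp[ncolor]
+//   {
+//     double me2 = 0;
+//     for( int icol = 0; icol < ncolor; icol++ )
+//       for( int jcol = 0; jcol < ncolor; jcol++ )
+//         me2 += std::real( std::conj( jamp[icol] ) * colorMatrix[icol][jcol] * jamp[jcol] ) / colorDenom[icol];
+//     return me2;
+//   }
+// The implementations below compute the same quantity with SIMD on C++, with a plain GPU
+// kernel, or with cuBLAS/hipBLAS GEMMs, using event-parallel data layouts.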
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB, where the cross terms cancel because M is also symmetric (AMB = BMA). + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed (though not a full factor 2) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
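+ // Worked 2x2 illustration of the identity above (with A = Re(jamp), B = Im(jamp) and M symmetric):
+ //   A M A = M00*A0*A0 + 2*M01*A0*A1 + M11*A1*A1 (and likewise B M B),
+ // i.e. the diagonal terms plus twice the upper-triangle terms, which is exactly what the
+ // icol/jcol loops below accumulate via the precomputed cf2 coefficients;
+ // the cross terms i(AMB - BMA) vanish because AMB == BMA for a symmetric M.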
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! However, the same striding as in compute_jamps and cuBLAS is used here, just in case it is better for performance + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its equivalents) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..b857887951 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h @@ -0,0 +1,4177 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
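+
+// Calling convention (an assumed usage sketch, for illustration only): calculate_jamps is
+// expected to invoke these kernels once per good helicity ihel, in diagram order; diagram1
+// also computes the external wavefunctions, so it additionally takes momenta and ihel, e.g.
+//   gpuLaunchKernel( diagram1, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//   gpuLaunchKernel( diagram2, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
+//   // ... up to diagram123, each adding its amplitude(s) into the jamps color flows,
+// after which color_sum_gpu (or color_sum_cpu) folds jamps into allMEs for that helicity.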
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 123 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 1 + VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) *
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif +
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } +
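+ // NB: in every diagramN, each amplitude amp_sv[0] enters a fixed subset of the ncolor=24 color flows
+ // with a coefficient +1, -1, +i or -i from the color decomposition, e.g.
+ //   J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; // add +i * amp into color flow 0
+ // so the jamps accumulated here are exactly the inputs later contracted with the color matrix by color_sum.
+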
+ //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV],
add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity
check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support
updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 123 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 123 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
+#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 123 *** + // Wavefunction(s) for diagram number 18 + FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 123 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
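[Editor's illustration] Every diagram kernel above starts by including diagram_boilerplate.h, whose contents are not part of this hunk. A minimal sketch of what the surrounding comments describe, assuming the header only has to enforce the nullptr sanity check in non-multichannel builds (it presumably also declares the local amp_fp buffer and its amp_sv complex view, since both are used in every kernel), could look roughly like this; it is not the actual header from this patch:

    // Hypothetical sketch of diagram_boilerplate.h (assumptions, not this PR's code)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // Multichannel builds receive real channelIds/numerators/denominators buffers
      // and update numerators_sv and denominators_sv after each amplitude call (#473)
    #else
      // Builds without multichannel support must pass nullptr for all three pointers
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif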
+  __global__ void
+  diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 123 ***
+    // Wavefunction(s) for diagram number 20
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 123 ***
+    // Wavefunction(s) for diagram number 21
+    // (none)
+    // Amplitude(s) for diagram number 21
+    FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 123 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 123 ***
+    // Wavefunction(s) for diagram number 23
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 123 ***
+    // Wavefunction(s) for diagram number 24
+    // (none)
+    // Amplitude(s) for diagram number 24
+    FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 123 ***
+    // Wavefunction(s) for diagram number 25
+    // (none)
+    // Amplitude(s) for diagram number 25
+    FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 26 OF 123 ***
+    // Wavefunction(s) for diagram number 26
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
+    // Amplitude(s) for diagram number 26
+    FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 123 ***
+    // Wavefunction(s) for diagram number 27
+    // (none)
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 123 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 123 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 123 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 123 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 123 ***
+    // Wavefunction(s) for diagram number 32
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
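[Editor's illustration] Each amplitude call above contributes to a handful of color amplitudes in jamps, with weight +/-1 or +/-i (the cxtype( 0, 1 ) factor). The J_ACCESS accessor itself is defined elsewhere in this patch; a minimal sketch of such an accessor, assuming one GPU thread per event and an [icol][real/imag][event] layout consistent with the jamps[ncolor*2*nevtORneppV] comment in the signatures (an assumption — the actual J_ACCESS class may differ):

    // Hypothetical accessor sketch (layout is an assumption, not this PR's J_ACCESS)
    struct JampAccessSketch
    {
      static __device__ cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
      {
        const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
        const int nevt = gridDim.x * blockDim.x;                // total events in the grid
        fptype& re = jamps[( icol * 2 + 0 ) * nevt + ievt];     // real part of jamp[icol]
        fptype& im = jamps[( icol * 2 + 1 ) * nevt + ievt];     // imaginary part of jamp[icol]
        return cxtype_ref( re, im );                            // assignable complex reference
      }
    };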
+  __global__ void
+  diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 123 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 123 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
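[Editor's illustration] Diagram 48 below, like diagram 32 above, accumulates three amplitudes built from the VVVV1/VVVV3/VVVV4 four-gluon-vertex wavefunctions into the same jamps buffer. Since every kernel shares one uniform signature, a GPU caller can simply chain the launches; a sketch in plain CUDA launch syntax (grid configuration, buffer names and the checkCuda helper are assumptions, not code from this patch):

    // Illustrative only: chaining per-diagram kernels with the uniform signature
    diagram47<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram48<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    checkCuda( cudaPeekAtLastError() ); // hypothetical error-check helper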
Wavefunction(s) for diagram number 48 + // (none) + // Amplitude(s) for diagram number 48 + FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 123 *** + // Wavefunction(s) for diagram number 49 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); + FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 123 ***
+    // Wavefunction(s) for diagram number 50
+    VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 123 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 123 ***
+    // Wavefunction(s) for diagram number 52
+    FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 123 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 123 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 123 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 123 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 123 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 123 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
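The recurring "(#473)" placeholder marks where a multichannel-enabled build updates `numerators_sv` and `denominators_sv` after each amplitude call. As a hedged sketch of what that update plausibly looks like (shown for diagram 58 as an assumed example; `cxabs2` standing for |amp|², with the exact emitted code possibly differing):

```cpp
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Assumed sketch of the single-diagram-enhancement bookkeeping (#473):
  // the numerator picks up |amp|^2 only for the SDE-selected channel, while
  // the denominator picks it up for every contributing diagram
  if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
#endif
```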
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 123 ***
+    // Wavefunction(s) for diagram number 59
+    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 59
+    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 123 ***
+    // Wavefunction(s) for diagram number 60
+    // (none)
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 123 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 123 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 123 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 123 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 123 ***
+    // Wavefunction(s) for diagram number 65
+    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 123 ***
+    // Wavefunction(s) for diagram number 66
+    VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 123 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 123 ***
+    // Wavefunction(s) for diagram number 68
+    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 123 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
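All of these kernels accumulate color amplitudes through `J_ACCESS::kernelAccessIcol( jamps, icol )`, indexing into the `jamps[ncolor*2*nevtORneppV]` buffer. A standalone toy illustrating how such an accessor can work over a split real/imaginary layout (the layout, the `neppV` value and all names below are assumptions for illustration, not the plugin's actual `J_ACCESS` implementation):

```cpp
#include <complex>
#include <cstdio>

constexpr int neppV = 4;   // events per SIMD page (illustrative value)
constexpr int ncolor = 24; // color amplitudes per event, as in this process

// Minimal reference proxy: acts like a complex<double>& over split re/im storage
struct cxref
{
  double& re;
  double& im;
  cxref& operator+=( const std::complex<double>& a ) { re += a.real(); im += a.imag(); return *this; }
  cxref& operator-=( const std::complex<double>& a ) { re -= a.real(); im -= a.imag(); return *this; }
};

// Toy kernelAccessIcol: color amplitude icol of event ievt in a jamps[ncolor*2*neppV] page
inline cxref kernelAccessIcol( double* jamps, int icol, int ievt )
{
  return { jamps[( icol * 2 + 0 ) * neppV + ievt],
           jamps[( icol * 2 + 1 ) * neppV + ievt] };
}

int main()
{
  double jamps[ncolor * 2 * neppV] = {};    // zeroed jamp buffer for one event page
  const std::complex<double> amp( 0., 1. ); // stands in for cxtype( 0, 1 ) * amp_sv[0]
  kernelAccessIcol( jamps, 5, 0 ) += amp;   // cf. the jamps[5] update in diagram68
  kernelAccessIcol( jamps, 19, 0 ) -= amp;  // cf. the jamps[19] update in diagram68
  std::printf( "jamp[5] = ( %g, %g )\n", jamps[5 * 2 * neppV], jamps[( 5 * 2 + 1 ) * neppV] );
  return 0;
}
```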
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 123 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 123 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 123 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 123 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 123 ***
+    // Wavefunction(s) for diagram number 74
+    // (none)
+    // Amplitude(s) for diagram number 74
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 123 ***
+    // Wavefunction(s) for diagram number 75
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 75
+    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 123 ***
+    // Wavefunction(s) for diagram number 76
+    // (none)
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
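Because every kernel from diagram49 onwards exposes the identical parameter list, a driver can treat them uniformly. A simplified CPU-style sketch of that dispatch idea, with stand-in types and stub diagrams (the real kernels are `__global__` and are launched differently on GPU; nothing below is the generated driver itself):

```cpp
#include <cstdio>

using fptype = double;

// One signature for every diagram, mirroring the uniform interface above
using DiagramFn = void ( * )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                              const fptype** COUPs, fptype* numerators, fptype* denominators );

static void diagramA( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* )
{
  jamps[0] += 1.; // stand-in for a real diagram's jamp updates
}

static void diagramB( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* )
{
  jamps[1] -= 1.;
}

int main()
{
  fptype jamps[2] = {};
  const DiagramFn diagrams[] = { diagramA, diagramB };
  // Without multichannel support the three SDE pointers stay nullptr,
  // matching the sanity check performed by the generated boilerplate
  for( DiagramFn d : diagrams )
    d( nullptr, jamps, nullptr, nullptr, nullptr, nullptr );
  std::printf( "jamps = { %g, %g }\n", jamps[0], jamps[1] );
  return 0;
}
```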
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 79 OF 123 *** + // Wavefunction(s) for diagram number 79 + // (none) + // Amplitude(s) for diagram number 79 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 80 OF 123 *** + // Wavefunction(s) for diagram number 80 + // (none) + // Amplitude(s) for diagram number 80 + FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 81 OF 123 *** + // Wavefunction(s) for diagram number 81 + FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 81 + FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 82 OF 123 *** + // Wavefunction(s) for diagram number 82 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 82 + FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 83 OF 123 *** + // Wavefunction(s) for diagram number 83 + // (none) + // Amplitude(s) for diagram number 83 + FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 84 OF 123 *** + // Wavefunction(s) for diagram number 84 + FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 84 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 85 OF 123 *** + // Wavefunction(s) for diagram number 85 + // (none) + // Amplitude(s) for diagram number 85 + FFV1_0( w_fp[3], w_fp[23], w_fp[10], 
+ // *** DIAGRAM 83 OF 123 ***
+ // Wavefunction(s) for diagram number 83
+ // (none)
+ // Amplitude(s) for diagram number 83
+ FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 84 OF 123 ***
+ // Wavefunction(s) for diagram number 84
+ FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+ // Amplitude(s) for diagram number 84
+ FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 85 OF 123 ***
+ // Wavefunction(s) for diagram number 85
+ // (none)
+ // Amplitude(s) for diagram number 85
+ FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 86 OF 123 ***
+ // Wavefunction(s) for diagram number 86
+ VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ // Amplitude(s) for diagram number 86
+ FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 87 OF 123 ***
+ // Wavefunction(s) for diagram number 87
+ FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+ // Amplitude(s) for diagram number 87
+ FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 88 OF 123 ***
+ // Wavefunction(s) for diagram number 88
+ FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+ // Amplitude(s) for diagram number 88
+ FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 89 OF 123 ***
+ // Wavefunction(s) for diagram number 89
+ // (none)
+ // Amplitude(s) for diagram number 89
+ FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 90 OF 123 ***
+ // Wavefunction(s) for diagram number 90
+ FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+ // Amplitude(s) for diagram number 90
+ FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 91 OF 123 ***
+ // Wavefunction(s) for diagram number 91
+ // (none)
+ // Amplitude(s) for diagram number 91
+ FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 92 OF 123 ***
+ // Wavefunction(s) for diagram number 92
+ // (none)
+ // Amplitude(s) for diagram number 92
+ FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 93 OF 123 ***
+ // Wavefunction(s) for diagram number 93
+ // (none)
+ // Amplitude(s) for diagram number 93
+ VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 94 OF 123 ***
+ // Wavefunction(s) for diagram number 94
+ VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+ // Amplitude(s) for diagram number 94
+ VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 95 OF 123 ***
+ // Wavefunction(s) for diagram number 95
+ VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+ // Amplitude(s) for diagram number 95
+ VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 96 OF 123 ***
+ // Wavefunction(s) for diagram number 96
+ // (none)
+ // Amplitude(s) for diagram number 96
+ FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 97 OF 123 ***
+ // Wavefunction(s) for diagram number 97
+ // (none)
+ // Amplitude(s) for diagram number 97
+ FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 98 OF 123 ***
+ // Wavefunction(s) for diagram number 98
+ // (none)
+ // Amplitude(s) for diagram number 98
+ FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 99 OF 123 ***
+ // Wavefunction(s) for diagram number 99
+ // (none)
+ // Amplitude(s) for diagram number 99
+ FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 100 OF 123 ***
+ // Wavefunction(s) for diagram number 100
+ // (none)
+ // Amplitude(s) for diagram number 100
+ VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 101 OF 123 ***
+ // Wavefunction(s) for diagram number 101
+ VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ // Amplitude(s) for diagram number 101
+ VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 102 OF 123 ***
+ // Wavefunction(s) for diagram number 102
+ // (none)
+ // Amplitude(s) for diagram number 102
+ VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 103 OF 123 ***
+ // Wavefunction(s) for diagram number 103
+ // (none)
+ // Amplitude(s) for diagram number 103
+ FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 104 OF 123 ***
+ // Wavefunction(s) for diagram number 104
+ // (none)
+ // Amplitude(s) for diagram number 104
+ FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 105 OF 123 ***
+ // Wavefunction(s) for diagram number 105
+ // (none)
+ // Amplitude(s) for diagram number 105
+ FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 106 OF 123 ***
+ // Wavefunction(s) for diagram number 106
+ // (none)
+ // Amplitude(s) for diagram number 106
+ FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 107 OF 123 ***
+ // Wavefunction(s) for diagram number 107
+ // (none)
+ // Amplitude(s) for diagram number 107
+ VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 108 OF 123 ***
+ // Wavefunction(s) for diagram number 108
+ // (none)
+ // Amplitude(s) for diagram number 108
+ VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 109 OF 123 ***
+ // Wavefunction(s) for diagram number 109
+ // (none)
+ // Amplitude(s) for diagram number 109
+ VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 110 OF 123 ***
+ // Wavefunction(s) for diagram number 110
+ // (none)
+ // Amplitude(s) for diagram number 110
+ FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
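+ // A hedged usage sketch (the generated caller is outside this hunk): each diagramXXX kernel is
+ // expected to be invoked in sequence on the same buffers, e.g.
+ //   diagram110( wfs, jamps, channelIds, couplings, numerators, denominators );
+ //   diagram111( wfs, jamps, channelIds, couplings, numerators, denominators );
+ // (the C++ build passes COUPs instead of couplings), so that jamps accumulates the colour
+ // flows of all 123 diagrams for this helicity before they are squared.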
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
+ __global__ void
+ diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 116 OF 123 ***
+ // Wavefunction(s) for diagram number 116
+ // (none)
+ // Amplitude(s) for diagram number 116
+ FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 117 OF 123 ***
+ // Wavefunction(s) for diagram number 117
+ VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+ VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
+ VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ // Amplitude(s) for diagram number 117
+ VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 118 OF 123 ***
+ // Wavefunction(s) for diagram number 118
+ // (none)
+ // Amplitude(s) for diagram number 118
+ FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 119 OF 123 ***
+ // Wavefunction(s) for diagram number 119
+ // (none)
+ // Amplitude(s) for diagram number 119
+ FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+ FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 120 OF 123 ***
+ // Wavefunction(s) for diagram number 120
+ VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
+ VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
+ // Amplitude(s) for diagram number 120
+ FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 121 OF 123 ***
+ // Wavefunction(s) for diagram number 121
+ // (none)
+ // Amplitude(s) for diagram number 121
+ FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+ FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 122 OF 123 ***
+ // Wavefunction(s) for diagram number 122
+ // (none)
+ // Amplitude(s) for diagram number 122
+ VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 123 OF 123 ***
+ // Wavefunction(s) for diagram number 123
+ // (none)
+ // Amplitude(s) for diagram number 123
+ VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
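For orientation on what this new header computes: the color sum turns the ncolor jamp color amplitudes of one event into a matrix-element contribution via the quadratic form deltaME = sum_{i,j} Re( conj(jamp[i]) * cf[i][j] * jamp[j] ), with cf the real color matrix. A per-event sketch under those assumptions (cf, jamp, deltaME and the cx helpers as used elsewhere in the plugin; this is not the batched implementation declared below):

    fptype deltaME = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype ztemp = cxmake( 0, 0 );
      for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
      deltaME += cxreal( ztemp * cxconj( jamp[icol] ) );
    }

Because cf is real, the real and imaginary planes of jamp enter through two independent real matrix products over all events, which is what makes the cuBLAS/hipBLAS GEMM path declared in this header applicable.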
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
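To make the "new1" striding above concrete: the jamps buffer holds two contiguous ncolor x nevt real matrices, first all real parts, then all imaginary parts, with the event index fastest. Event-index-last means consecutive CUDA threads touch consecutive addresses (coalesced access), and each color row is a contiguous, BLAS-friendly vector of length nevt. Hypothetical index helpers (not part of the header), mirroring the arithmetic in DeviceAccessJamp:

    // real part of color icol for event ievt: first ncolor*nevt plane
    inline int jampIndexReal( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
    // imaginary part: same offset within the second ncolor*nevt plane
    inline int jampIndexImag( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }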
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ); // input: cuda gputhreads
+#endif
+
+ //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 
 else ifeq ($(BACKEND),hip)
 
+ # example architecture values MI200:gfx90a, MI300X:gfx942
+ MADGRAPH_HIP_ARCHITECTURE ?= gfx942
 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
 GPUCC = $(HIP_HOME)/bin/hipcc
 XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
 ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 
 # AMD HIP architecture flags
- GPUARCHFLAGS = --offload-arch=gfx90a
+ GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
 GPUFLAGS += $(GPUARCHFLAGS)
 
 # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 
 #-------------------------------------------------------------------------------
 
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+ ifeq ($(GPUCC),) # CPU-only build
+ override HASBLAS = hasNoBlas
+ else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+ # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+ # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
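A note on how this new header is consumed: diagram_boilerplate.h is not a conventional include but a code fragment textually pasted inside every diagramXXX function body, so each generated kernel effectively expands to the pattern below (schematic only, with the boilerplate contents elided and the parameter list abbreviated):

    __global__ void
    diagramNNN( fptype* wfs, fptype* jamps, const unsigned int* channelIds, /* couplings or COUPs */ fptype* numerators, fptype* denominators )
    {
    #include "diagram_boilerplate.h" // binds w_fp, COUPs, amp_sv/amp_fp, channelId, numerators_sv/denominators_sv
      // ... wavefunction and amplitude calls for diagram NNN, accumulating into jamps via J_ACCESS ...
    }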
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+ //-------------
+ // GPU only
+ //-------------
+
+ //using namespace mg5amcGpu;
+ using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA)
+ unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+ // Wavefunctions
+ // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+ // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+ // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+ // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+ // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+ // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+ // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+ const int nevt = gridDim.x * blockDim.x;
+ fptype* w_fp[nwf];
+ for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+ // Couplings
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. 
); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1341,7 +1341,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4_0( const fptype allV1[],
            const fptype allV2[],
@@ -1356,7 +1356,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] );
@@ -1371,7 +1371,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -1386,7 +1386,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
index 47a3a011b8..fd5642f3e3 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
index 76066c7bb1..f4b086fc96 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -310,7 +310,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
   // Compute the output couplings (e.g. gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
   __device__ inline void
   G2COUP( const fptype gs[],
           fptype couplings[],
@@ -320,12 +320,12 @@ namespace mg5amcCpu
     using namespace Parameters_sm_dependentCouplings;
     const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
     DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
-    fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
-    fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
-    cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
-    cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
-    cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s );
+    fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+    fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+    fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
+    cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
+    cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+    cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s );
     GC_10s_sv = couplings_sv.GC_10;
     GC_11s_sv = couplings_sv.GC_11;
     GC_12s_sv = couplings_sv.GC_12;
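The only functional change in the HelAmps and G2COUP hunks above is the rename of the coupling accessor template parameter from C_ACCESS to CD_ACCESS, presumably to mark it as the accessor for the event-dependent couplings (the G2COUP body works with Parameters_sm_dependentCouplings). The standalone C++ sketch below illustrates the accessor-class idiom behind that parameter; every name in it is an illustrative stand-in, not one of the plugin's real classes.

// Illustrative sketch only (simplified, assumed types): a static accessor passed
// as a template parameter decides how a kernel fetches a coupling from a raw
// buffer, so one kernel body can serve any memory layout without runtime dispatch.
#include <complex>
#include <iostream>

typedef double fptype;
typedef std::complex<double> cxtype;

struct ScalarCouplingAccess // hypothetical accessor for a trivial re/im layout
{
  static cxtype kernelAccessConst( const fptype* buf ) { return cxtype( buf[0], buf[1] ); }
};

template<class CD_ACCESS> // same role as the renamed parameter in the hunks above
cxtype scaleAmplitude( const fptype allCOUP[], const cxtype& amp )
{
  const cxtype COUP = CD_ACCESS::kernelAccessConst( allCOUP );
  return COUP * amp; // a real HelAmps kernel would build TMP9 etc. instead
}

int main()
{
  const fptype coup[2] = { 0., 1. }; // the coupling "i"
  std::cout << scaleAmplitude<ScalarCouplingAccess>( coup, cxtype( 2., 0. ) ) << std::endl; // prints (0,2)
  return 0;
}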
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose whether cuBLAS and hipBLAS are supported for generating random numbers
+// For both CUDA and HIP, by default, do not disable BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@
 using mgOnGpu::fptype2;
 #endif /* clang-format off */

-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
-#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
-#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
-#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); }
-#else
+//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
+//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
+//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
+//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); }
+//#else
 #define mgDebugDeclare() /*noop*/
-#define mgDebugInitialise() { /*noop*/ }
-#define mgDebug( code, text ) { /*noop*/ }
-#define mgDebugFinalise() { /*noop*/ }
-#endif /* clang-format on */
+#define mgDebugInitialise() /*noop*/
+#define mgDebug( code, text ) /*noop*/
+#define mgDebugFinalise() /*noop*/
+//#endif /* clang-format on */

 // Define empty CUDA/HIP declaration specifiers for C++
 #ifndef MGONGPUCPP_GPUIMPL
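The new MGONGPU_HAS_NO_BLAS block above defaults to assuming that cuBLAS/hipBLAS are available in CUDA and HIP builds, always disables BLAS in plain C++ builds, and can be forced off externally with -DMGONGPU_HAS_NO_BLAS. A minimal standalone sketch of the same compile-time selection follows; the printed messages are illustrative only, not plugin output.

// Sketch of the default logic in the hunk above: plain C++ builds always define
// MGONGPU_HAS_NO_BLAS, while CUDA/HIP builds leave it unset unless the user
// compiles with -DMGONGPU_HAS_NO_BLAS on the command line.
#include <iostream>

#if defined __CUDACC__ || defined __HIPCC__
// GPU build: cuBLAS/hipBLAS assumed available unless disabled from outside
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++ build: no BLAS, as in the new default above
#endif

int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  std::cout << "building without cuBLAS/hipBLAS" << std::endl;
#else
  std::cout << "building with cuBLAS/hipBLAS support" << std::endl;
#endif
  return 0;
}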
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index 92d74fd6db..e98e925f2a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCXTYPES_H
 #define MGONGPUCXTYPES_H 1
@@ -717,12 +717,24 @@ namespace mg5amcCpu
     : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs
     cxtype_ref& operator=( const cxtype_ref& ) = delete;
     //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary
-    __host__ __device__ cxtype_ref& operator=( const cxtype& c )
+    __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values
     {
       *m_preal = cxreal( c );
       *m_pimag = cximag( c );
       return *this;
-    } // copy (assign) non-const values
+    }
+    __host__ __device__ cxtype_ref& operator+=( const cxtype& c )
+    {
+      *m_preal += cxreal( c );
+      *m_pimag += cximag( c );
+      return *this;
+    }
+    __host__ __device__ cxtype_ref& operator-=( const cxtype& c )
+    {
+      *m_preal -= cxreal( c );
+      *m_pimag -= cximag( c );
+      return *this;
+    }
     __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); }
   private:
     fptype* const m_preal; // const pointer to non-const fptype R
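cxtype_ref, extended above, is a proxy that presents a (real, imaginary) pair stored in two separate fptype slots as a single complex lvalue; the new operator+= and operator-= allow in-place accumulation through the proxy rather than a full read-build-assign round trip via operator=. Below is a standalone, host-only miniature with simplified types (the real class lives in mgOnGpuCxtypes.h and also carries __host__ __device__ qualifiers).

// Miniature of the cxtype_ref proxy, mirroring only the newly added operator+=.
#include <complex>
#include <iostream>

typedef double fptype;
typedef std::complex<double> cxtype;

class cxtype_ref_sketch
{
public:
  cxtype_ref_sketch( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref_sketch& operator+=( const cxtype& c ) // mirrors the added operator
  {
    *m_preal += c.real();
    *m_pimag += c.imag();
    return *this;
  }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const fptype (real part)
  fptype* const m_pimag; // const pointer to non-const fptype (imaginary part)
};

int main()
{
  fptype re = 1., im = 2.; // split storage, as in the SIMD/GPU buffers
  cxtype_ref_sketch ref( re, im );
  ref += cxtype( 0.5, -1. ); // accumulate straight into the two slots
  std::cout << cxtype( ref ) << std::endl; // prints (1.5,1)
  return 0;
}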
diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
index f703a1ae7c..1f9f8bbc46 100644
--- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin.
-# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin.
+# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin.

 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

@@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 all: googletest/$(INSTALLDIR)/lib64/libgtest.a

 googletest/CMakeLists.txt:
-	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
+	git clone https://github.com/google/googletest.git -b v1.17.0 googletest

 googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
 	mkdir -p googletest/$(BUILDDIR)
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 1afa1ab2a5..50bf1bb4e8 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005990028381347656 +DEBUG: model prefixing takes 0.005486965179443359 INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions DEBUG: remove interactions: u s w+ at order: QED=1 @@ -151,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.890 s +1 processes with 1240 diagrams generated in 1.870 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] = 32 [export_v4.py at line 4334] +DEBUG: opt['output_options']['vector_size'] = 32 [export_v4.py at line 4166] Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171] +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175] INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176] -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180] +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156] INFO: Creating files in directory .
FileWriter for ././CPPProcess.h @@ -179,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.565 s -Wrote files for 2281 helas calls in 18.614 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1665]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.505 s +Wrote files for 2281 helas calls in 17.781 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.373 s +ALOHA: aloha creates 5 routines in 0.313 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.356 s VVV1 VVV1 FFV1 @@ -210,38 +210,38 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 339 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
quit -real 0m33.065s -user 0m32.263s +real 0m32.103s +user 0m31.529s sys 0m0.459s -Code generation completed in 33 seconds +Code generation completed in 32 seconds ************************************************************ * * * W E L C O M E to * @@ -254,7 +254,7 @@ Code generation completed in 33 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -262,9 +262,9 @@ Code generation completed in 33 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -284,7 +284,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -292,9 +292,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index cdd9d43b05..e6a9fc4dae 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index a08f93d92b..596243d42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 48050a5fd7..377d5bc1c7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/makefile b/epochX/cudacpp/gg_ttggg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
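The Bridge.h hunks above gate the "Instantiate device/host Bridge" printouts and the flagAbnormalMEs calls behind an MGONGPUCPP_VERBOSE compile-time switch, so default builds stay silent and skip the associated work. A minimal self-contained sketch of that gating pattern; the VERBOSE_LOG macro name is hypothetical and introduced purely for illustration:

```cpp
#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to enable diagnostic printouts;
// in default (quiet) builds the statement expands to nothing at all.
#ifdef MGONGPUCPP_VERBOSE
#define VERBOSE_LOG( msg ) std::cout << msg << std::endl
#else
#define VERBOSE_LOG( msg )
#endif

int main()
{
  const int nevt = 16384; // illustrative event count
  VERBOSE_LOG( "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" );
  return 0;
}
```

Because the guard is resolved by the preprocessor rather than by a runtime flag, quiet builds pay neither the I/O cost nor, in the flagAbnormalMEs case, the cost of scanning the ME buffer.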
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
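The GpuAbstraction.h hunks above introduce a single gpuBlas* vocabulary that maps onto cuBLAS under __CUDACC__ and onto hipBLAS under __HIPCC__, plus a second layer of gpuBlasT* aliases that selects the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A compilable toy sketch of that two-level, compile-time dispatch; fakeSgemm, fakeDgemm and blasTgemm are hypothetical stand-ins, not the real BLAS signatures:

```cpp
#include <cstdio>

// Stand-ins for the vendor gemm entry points hidden behind the macros.
inline void fakeSgemm() { std::printf( "single-precision gemm\n" ); }
inline void fakeDgemm() { std::printf( "double-precision gemm\n" ); }

// One neutral name per operation, resolved at compile time by the active
// precision macro, mirroring the gpuBlasTgemm -> gpuBlasSgemm/Dgemm aliases.
#ifdef MGONGPU_FPTYPE2_FLOAT
#define blasTgemm fakeSgemm
#else
#define blasTgemm fakeDgemm
#endif

int main()
{
  blasTgemm(); // the S or D variant is chosen with no runtime branch
  return 0;
}
```

The same pattern lets MGONGPU_HAS_NO_BLAS builds keep the API shape (gpuBlasHandle_t becomes void) while every actual BLAS call is compiled out.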
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
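The GpuRuntime.h hunks above add a checkGpuBlas/assertGpuBlas pair that follows the same report-then-assert idiom as the existing checkGpu wrapper: capture the status code together with the __FILE__/__LINE__ call site, print on failure, then abort via assert. The sketch below reproduces that idiom with a plain int status so it compiles without CUDA or BLAS headers; statusCode_t and STATUS_SUCCESS are illustrative stand-ins for gpuBlasStatus_t and GPUBLAS_STATUS_SUCCESS:

```cpp
#include <cassert>
#include <cstdio>

typedef int statusCode_t;                  // stand-in for gpuBlasStatus_t
constexpr statusCode_t STATUS_SUCCESS = 0; // stand-in for GPUBLAS_STATUS_SUCCESS

// The macro records the call site; the function reports and aborts.
#define checkStatus( code ) { assertStatus( code, __FILE__, __LINE__ ); }
inline void assertStatus( statusCode_t code, const char* file, int line, bool abort = true )
{
  if( code != STATUS_SUCCESS )
  {
    std::printf( "ERROR! assertStatus: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == STATUS_SUCCESS );
  }
}

int main()
{
  checkStatus( STATUS_SUCCESS ); // a nonzero status would print and assert
  return 0;
}
```

Note also that setUp and tearDown in the same file now default to debug=false, so the "__GpuRuntime: calling GpuDeviceReset()" style messages appear only when explicitly requested.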
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS 
temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
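The MemoryAccessAmplitudes.h hunks above make the formerly optional MGONGPU_TRIVIAL_AMPLITUDES path the only one: the AOSOA helper classes and the MemoryAccessHelpers machinery are removed, and amplitude access reduces to reinterpreting the flat fptype buffer as an array of (vectorized) complex numbers. A minimal host-only sketch of that surviving trivial access, with a two-double struct standing in for cxtype_sv:

```cpp
#include <cstdio>

// Minimal stand-in for cxtype_sv: one complex number as two doubles.
struct cx { double r, i; };

// The surviving "trivial" access: the buffer is already laid out as
// [re,im][re,im]..., so access is a single reinterpret_cast.
inline cx* kernelAccess( double* buffer )
{
  return reinterpret_cast<cx*>( buffer );
}

int main()
{
  double buf[4] = { 1.0, 2.0, 3.0, 4.0 };
  cx* amps = kernelAccess( buf );
  std::printf( "amp0 = (%g, %g)\n", amps[0].r, amps[0].i ); // prints (1, 2)
  return 0;
}
```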
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
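The MemoryAccessWavefunctions change above replaces the AOSOA record/decode machinery with a single per-event pointer: after this patch each event owns a contiguous [nw6][nx2] slice of the wavefunction buffer, so locating an event's wavefunctions takes one multiplication and one cast. A minimal standalone sketch of the two indexing schemes (illustration only, not part of the patch; the double fptype and the values nw6=6, nx2=2 are assumptions for this example):

    // Illustration only: wavefunction indexing before/after this patch
    using fptype = double; // assumed: double-precision build
    constexpr int nw6 = 6; // components of a (fermion or vector) wavefunction
    constexpr int nx2 = 2; // real and imaginary parts of a complex number

    // Old AOSOA access: events are grouped in "W-pages" of neppW events,
    // laid out as AOSOA[ipagW][iw6][ix2][ieppW]
    inline fptype& ieventAccessIw6Ix2( fptype* buffer, int ievt, int iw6, int ix2, int neppW )
    {
      const int ipagW = ievt / neppW; // event "W-page"
      const int ieppW = ievt % neppW; // event within the current W-page
      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW];
    }

    // New per-event access (as in kernelAccess above): one contiguous
    // [nw6][nx2] slice per event, located with a single offset
    inline fptype* eventSlice( fptype* buffer, int ievt )
    {
      return buffer + ievt * nw6 * nx2; // the real code casts this to cxtype_sv*
    }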
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index ba06f6ff44..6b4b8dc8ce 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,30013 +279,2613 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
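As the calculate_jamps comments above explain, this function now only produces the ncolor QCD partial amplitudes (jamps) per helicity; the color sum that turns them into matrix elements has moved to a separate kernel (see the new color_sum.h include). For reference, the quadratic form being factored out is the standard MG5aMC one; a minimal sketch with plain std::complex stand-ins (illustration only; the names cf and denom stand for the generated color matrix and its denominators, and this flat layout is an assumption of the example, not the actual data structures):

    #include <complex>

    // Illustration only: ME contribution of one helicity from its ncolor jamps,
    // ME += sum_i Re( ( sum_j cf[i][j] * jamp[j] ) * conj( jamp[i] ) ) / denom[i]
    inline double colorSum( int ncolor, const std::complex<double>* jamp,
                            const double* cf /* row-major [ncolor*ncolor] */,
                            const double* denom )
    {
      double me = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += cf[icol * ncolor + jcol] * jamp[jcol];
        me += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
      }
      return me;
    }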
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1240 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); - VVV1P0_1(
w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 2 OF 1240 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 3 OF 1240 *** - - // Wavefunction(s) for diagram number 3 - // (none) - - // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 1240 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0.,
w_fp[13] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 5 OF 1240 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 6 OF 1240 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 1240 *** - - // Wavefunction(s) for diagram number 7 - VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 7 - VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if(
channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 1240 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 9 OF 1240 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 10 OF 1240 *** - - // Wavefunction(s) for diagram number 10 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; -
jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 11 OF 1240 *** - - // Wavefunction(s) for diagram number 11 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 11 - VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 12 OF 1240 *** - - // Wavefunction(s) for diagram number 12 - VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); -
VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 13 OF 1240 *** - - // Wavefunction(s) for diagram number 13 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 13 - VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 1240 *** - - // Wavefunction(s) for diagram number 14 - VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); -
- // Amplitude(s) for diagram number 14 - VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 15 OF 1240 *** - - // Wavefunction(s) for diagram number 15 - VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] ); - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 16 OF 1240 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 17 OF 1240 *** - - // Wavefunction(s) for diagram number 17 - VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] ); - - // Amplitude(s) for diagram number 17 - VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -=
amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 18 OF 1240 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 19 OF 1240 *** - - // Wavefunction(s) for diagram number 19 - VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] ); - - // Amplitude(s) for diagram number 19 - VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 20 OF 1240 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 21 OF 1240 *** - - // Wavefunction(s) for diagram number 21 - VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); - - // Amplitude(s) for diagram number 21 - VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; -
jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 22 OF 1240 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 23 OF 1240 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 24 OF 1240 *** - - // Wavefunction(s) for diagram number 24 - VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] ); - - // Amplitude(s) for diagram number 24 - VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; -
jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 25 OF 1240 *** - - // Wavefunction(s) for diagram number 25 - VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] ); - VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] ); - VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] ); - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 26 OF 1240 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); - FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 27 OF 1240 *** - - // Wavefunction(s) for diagram number 27 - FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 1240 *** - - // Wavefunction(s) for diagram number 28 - FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] ); - - // Amplitude(s) for diagram number 28 - VVV1_0( w_fp[12],
-
-    // *** DIAGRAM 28 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 28
-    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
-
-    // Amplitude(s) for diagram number 28
-    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 29 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 29
-    // (none)
-
-    // Amplitude(s) for diagram number 29
-    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 30 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 30
-    // (none)
-
-    // Amplitude(s) for diagram number 30
-    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 31 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 31
-    // (none)
-
-    // Amplitude(s) for diagram number 31
-    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 32 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 32
-    // (none)
-
-    // Amplitude(s) for diagram number 32
-    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 33 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 33
-    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
-    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-
-    // Amplitude(s) for diagram number 33
-    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 34 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 34
-    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 34
-    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 35 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 35
-    // (none)
-
-    // Amplitude(s) for diagram number 35
-    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 36 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 36
-    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
-
-    // Amplitude(s) for diagram number 36
-    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 37 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 37
-    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
-
-    // Amplitude(s) for diagram number 37
-    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 38 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 38
-    // (none)
-
-    // Amplitude(s) for diagram number 38
-    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+    // Dependent couplings, vary event-by-event
+    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+      COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
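
The added lines above carry the actual fix recorded in this hunk (FIX #823): both independent-couplings loops are bounded by nIPC, the number of independent couplings stored in cIPC, where the diff comment says they previously used a different constant nicoup. The dependent couplings fill the first ndcoup slots of the same pointer array and are offset to the current event record, while the independent ones are shared by all events. A self-contained sketch of that layout, with invented plain-array buffers standing in for the real CD_ACCESS and CI_ACCESS classes:

    #include <cstddef>
    #include <cstdio>

    constexpr size_t ndcoup = 1; // dependent couplings, vary event-by-event
    constexpr size_t nIPC = 2;   // independent couplings, fixed for all events
    constexpr size_t nevt = 4;

    int main()
    {
      double dependent[ndcoup][nevt] = { { 1.1, 1.2, 1.3, 1.4 } }; // one value per event
      double independent[nIPC] = { 0.5, 0.7 };                     // one value in total

      // One pointer array covering both kinds of couplings
      const double* allCOUPs[ndcoup + nIPC];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
        allCOUPs[idcoup] = dependent[idcoup];               // per-event buffers
      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )     // bound must match the stored couplings
        allCOUPs[ndcoup + iicoup] = &independent[iicoup];   // fixed for all events

      // Per-event view: dependent couplings are offset to the event record,
      // independent couplings are passed through unchanged
      const size_t ievt0 = 2;
      const double* COUPs[ndcoup + nIPC];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
        COUPs[idcoup] = &allCOUPs[idcoup][ievt0];
      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )
        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];

      std::printf( "dependent coupling for event %zu: %f\n", ievt0, *COUPs[0] );
      std::printf( "independent couplings: %f %f\n", *COUPs[1], *COUPs[2] );
      return 0;
    }

If the second loop ran past nIPC, the pointer array would be filled beyond the couplings actually stored, which is the kind of off-by-a-constant mismatch the FIX #823 comment guards against.
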
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 39 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 39
-    // (none)
-
-    // Amplitude(s) for diagram number 39
-    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 40 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 40
-    // (none)
-
-    // Amplitude(s) for diagram number 40
-    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 41 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 41
-    // (none)
-
-    // Amplitude(s) for diagram number 41
-    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 42 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 42
-    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
-
-    // Amplitude(s) for diagram number 42
-    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 43 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 43
-    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
-
-    // Amplitude(s) for diagram number 43
-    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 44 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 44
-    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
-
-    // Amplitude(s) for diagram number 44
-    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 45 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 45
-    // (none)
-
-    // Amplitude(s) for diagram number 45
-    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 46 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 46
-    // (none)
-
-    // Amplitude(s) for diagram number 46
-    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 47 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 47
-    // (none)
-
-    // Amplitude(s) for diagram number 47
-    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 48 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 48
-    // (none)
-
-    // Amplitude(s) for diagram number 48
-    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 49 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 49
-    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
-    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-
-    // Amplitude(s) for diagram number 49
-    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 50 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 50
-    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 50
-    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 51 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 51
-    // (none)
-
-    // Amplitude(s) for diagram number 51
-    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 52 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 52
-    // (none)
-
-    // Amplitude(s) for diagram number 52
-    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 53 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 54 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 55 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 58
-    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
-
-    // Amplitude(s) for diagram number 58
-    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 59
-    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
-
-    // Amplitude(s) for diagram number 59
-    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 60 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 60
-    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 62 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 63 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 64 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 65
-    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 66
-    // (none)
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 67 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 68 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 71 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 74
-    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
-
-    // Amplitude(s) for diagram number 74
-    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 75
-    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
-
-    // Amplitude(s) for diagram number 75
-    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 76
-    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 78 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 80 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 81
-    // (none)
-
-    // Amplitude(s) for diagram number 81
-    FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 82 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 82
-    // (none)
-
-    // Amplitude(s) for diagram number 82
-    FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 83 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 83
-    // (none)
-
-    // Amplitude(s) for diagram number 83
-    FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 84 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 84
-    FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] );
-
-    // Amplitude(s) for diagram number 84
-    FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 85 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 85
-    FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 85
-    FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 86 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 86
-    FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 86
-    VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 87 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 87
-    // (none)
-
-    // Amplitude(s) for diagram number 87
-    FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 88 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 88
-    // (none)
-
-    // Amplitude(s) for diagram number 88
-    VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 89 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 89
-    // (none)
-
-    // Amplitude(s) for diagram number 89
-    FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-
-    // *** DIAGRAM 90 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 90
-    // (none)
-
-    // Amplitude(s) for diagram number 90
-    FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 91 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 91
-    // (none)
-
-    // Amplitude(s) for diagram number 91
-    FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-
-    // *** DIAGRAM 92 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 92
-    // (none)
-
-    // Amplitude(s) for diagram number 92
-    FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 93 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 93
-    // (none)
-
-    // Amplitude(s) for diagram number 93
-    FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 94 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 94
-    FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] );
-
-    // Amplitude(s) for diagram number 94
-    FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 95 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 95
-    FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 95
-    FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 96 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 96
-    FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] );
-
-    // Amplitude(s) for diagram number 96
-    VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 97 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 97
-    // (none)
-
-    // Amplitude(s) for diagram number 97
-    FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-
-    // *** DIAGRAM 98 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 98
-    // (none)
-
-    // Amplitude(s) for diagram number 98
-    VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 99 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 99
-    // (none)
-
-    // Amplitude(s) for diagram number 99
-    FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 101 OF 1240 *** - - // Wavefunction(s) for diagram number 101 - // (none) - - // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - - // *** DIAGRAM 102 OF 1240 *** - - // Wavefunction(s) for diagram number 102 - // (none) - - // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 103 OF 1240 *** - - // Wavefunction(s) for diagram number 103 - // (none) - - // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 104 OF 1240 *** - - // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); - - // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 105 OF 1240 *** - - // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); - - // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - 
- // *** DIAGRAM 106 OF 1240 *** - - // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 107 OF 1240 *** - - // Wavefunction(s) for diagram number 107 - // (none) - - // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 108 OF 1240 *** - - // Wavefunction(s) for diagram number 108 - // (none) - - // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 109 OF 1240 *** - - // Wavefunction(s) for diagram number 109 - // (none) - - // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 110 OF 1240 *** - - // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - - // *** DIAGRAM 111 OF 1240 *** - - // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); 
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 112 OF 1240 *** - - // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 113 OF 1240 *** - - // Wavefunction(s) for diagram number 113 - // (none) - - // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 114 OF 1240 *** - - // Wavefunction(s) for diagram number 114 - // (none) - - // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 115 OF 1240 *** - - // Wavefunction(s) for diagram number 115 - // (none) - - // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 116 OF 1240 *** - - // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - - // *** DIAGRAM 117 OF 1240 *** - - // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); - - // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 118 OF 1240 *** - - // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); - - // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 119 OF 1240 *** - - // Wavefunction(s) for diagram number 119 - // (none) - - // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 120 OF 1240 *** - - // Wavefunction(s) for diagram number 120 - // (none) - - // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 121 OF 1240 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += 
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 122 OF 1240 ***
-
- // Wavefunction(s) for diagram number 122
- // (none)
-
- // Amplitude(s) for diagram number 122
- FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 123 OF 1240 ***
-
- // Wavefunction(s) for diagram number 123
- // (none)
-
- // Amplitude(s) for diagram number 123
- FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 124 OF 1240 ***
-
- // Wavefunction(s) for diagram number 124
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
- FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 124
- FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 125 OF 1240 ***
-
- // Wavefunction(s) for diagram number 125
- FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 125
- FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 126 OF 1240 ***
-
- // Wavefunction(s) for diagram number 126
- FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] );
- FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] );
-
- // Amplitude(s) for diagram number 126
- FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 127 OF 1240 ***
-
- // Wavefunction(s) for diagram number 127
- // (none)
-
- // Amplitude(s) for diagram number 127
- FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 128 OF 1240 ***
-
- // Wavefunction(s) for diagram number 128
- FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] );
-
- // Amplitude(s) for diagram number 128
- FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 129 OF 1240 ***
-
- // Wavefunction(s) for diagram number 129
- // (none)
-
- // Amplitude(s) for diagram number 129
- FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 130 OF 1240 ***
-
- // Wavefunction(s) for diagram number 130
- FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );
-
- // Amplitude(s) for diagram number 130
- VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 131 OF 1240 ***
-
- // Wavefunction(s) for diagram number 131
- FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
-
- // Amplitude(s) for diagram number 131
- FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 132 OF 1240 ***
-
- // Wavefunction(s) for diagram number 132
- // (none)
-
- // Amplitude(s) for diagram number 132
- FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 133 OF 1240 ***
-
- // Wavefunction(s) for diagram number 133
- // (none)
-
- // Amplitude(s) for diagram number 133
- VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 134 OF 1240 ***
-
- // Wavefunction(s) for diagram number 134
- FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 134
- FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 135 OF 1240 ***
-
- // Wavefunction(s) for diagram number 135
- // (none)
-
- // Amplitude(s) for diagram number 135
- FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 136 OF 1240 ***
-
- // Wavefunction(s) for diagram number 136
- // (none)
-
- // Amplitude(s) for diagram number 136
- VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 137 OF 1240 ***
-
- // Wavefunction(s) for diagram number 137
- // (none)
-
- // Amplitude(s) for diagram number 137
- FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 138 OF 1240 ***
-
- // Wavefunction(s) for diagram number 138
- FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
- // Amplitude(s) for diagram number 138
- FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 139 OF 1240 ***
-
- // Wavefunction(s) for diagram number 139
- // (none)
-
- // Amplitude(s) for diagram number 139
- FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 140 OF 1240 ***
-
- // Wavefunction(s) for diagram number 140
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] );
- FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] );
- VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] );
-
- // Amplitude(s) for diagram number 140
- VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 141 OF 1240 ***
-
- // Wavefunction(s) for diagram number 141
- VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] );
-
- // Amplitude(s) for diagram number 141
- VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 142 OF 1240 ***
-
- // Wavefunction(s) for diagram number 142
- // (none)
-
- // Amplitude(s) for diagram number 142
- VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 143 OF 1240 ***
-
- // Wavefunction(s) for diagram number 143
- FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] );
-
- // Amplitude(s) for diagram number 143
- FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 144 OF 1240 ***
-
- // Wavefunction(s) for diagram number 144
- // (none)
-
- // Amplitude(s) for diagram number 144
- FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 145 OF 1240 ***
-
- // Wavefunction(s) for diagram number 145
- // (none)
-
- // Amplitude(s) for diagram number 145
- FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 146 OF 1240 ***
-
- // Wavefunction(s) for diagram number 146
- // (none)
-
- // Amplitude(s) for diagram number 146
- FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 147 OF 1240 ***
-
- // Wavefunction(s) for diagram number 147
- FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] );
-
- // Amplitude(s) for diagram number 147
- FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 148 OF 1240 ***
-
- // Wavefunction(s) for diagram number 148
- FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] );
-
- // Amplitude(s) for diagram number 148
- VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 149 OF 1240 ***
-
- // Wavefunction(s) for diagram number 149
- // (none)
-
- // Amplitude(s) for diagram number 149
- FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 150 OF 1240 ***
-
- // Wavefunction(s) for diagram number 150
- // (none)
-
- // Amplitude(s) for diagram number 150
- FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 151 OF 1240 ***
-
- // Wavefunction(s) for diagram number 151
- FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 151
- VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 152 OF 1240 ***
-
- // Wavefunction(s) for diagram number 152
- // (none)
-
- // Amplitude(s) for diagram number 152
- FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 153 OF 1240 ***
-
- // Wavefunction(s) for diagram number 153
- // (none)
-
- // Amplitude(s) for diagram number 153
- FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 154 OF 1240 ***
-
- // Wavefunction(s) for diagram number 154
- // (none)
-
- // Amplitude(s) for diagram number 154
- VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 155 OF 1240 ***
-
- // Wavefunction(s) for diagram number 155
- // (none)
-
- // Amplitude(s) for diagram number 155
- FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 156 OF 1240 ***
-
- // Wavefunction(s) for diagram number 156
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] );
- VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] );
-
- // Amplitude(s) for diagram number 156
- VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 157 OF 1240 ***
-
- // Wavefunction(s) for diagram number 157
- VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
-
- // Amplitude(s) for diagram number 157
- VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 158 OF 1240 ***
-
- // Wavefunction(s) for diagram number 158
- // (none)
-
- // Amplitude(s) for diagram number 158
- VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 159 OF 1240 ***
-
- // Wavefunction(s) for diagram number 159
- FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
- // Amplitude(s) for diagram number 159
- FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 160 OF 1240 ***
-
- // Wavefunction(s) for diagram number 160
- // (none)
-
- // Amplitude(s) for diagram number 160
- FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 161 OF 1240 ***
-
- // Wavefunction(s) for diagram number 161
- // (none)
-
- // Amplitude(s) for diagram number 161
- FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 162 OF 1240 ***
-
- // Wavefunction(s) for diagram number 162
- // (none)
-
- // Amplitude(s) for diagram number 162
- FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 163 OF 1240 ***
-
- // Wavefunction(s) for diagram number 163
- FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
-
- // Amplitude(s) for diagram number 163
- FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 164 OF 1240 ***
-
- // Wavefunction(s) for diagram number 164
- FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
-
- // Amplitude(s) for diagram number 164
- VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 165 OF 1240 ***
-
- // Wavefunction(s) for diagram number 165
- // (none)
-
- // Amplitude(s) for diagram number 165
- FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 166 OF 1240 ***
-
- // Wavefunction(s) for diagram number 166
- // (none)
-
- // Amplitude(s) for diagram number 166
- FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 167 OF 1240 ***
-
- // Wavefunction(s) for diagram number 167
- // (none)
-
- // Amplitude(s) for diagram number 167
- VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 168
- // (none)
-
- // Amplitude(s) for diagram number 168
- FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 169
- // (none)
-
- // Amplitude(s) for diagram number 169
- FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 170
- // (none)
-
- // Amplitude(s) for diagram number 170
- VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 171
- // (none)
-
- // Amplitude(s) for diagram number 171
- FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 172 OF 1240 ***
-
- // Wavefunction(s) for diagram number 172
- VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
- VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
-
- // Amplitude(s) for diagram number 172
- VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 173 OF 1240 ***
-
- // Wavefunction(s) for diagram number 173
- VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
-
- // Amplitude(s) for diagram number 173
- VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 174 OF 1240 ***
-
- // Wavefunction(s) for diagram number 174
- // (none)
-
- // Amplitude(s) for diagram number 174
- VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 175 OF 1240 ***
-
- // Wavefunction(s) for diagram number 175
- FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
-
- // Amplitude(s) for diagram number 175
- FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 176 OF 1240 ***
-
- // Wavefunction(s) for diagram number 176
- // (none)
-
- // Amplitude(s) for diagram number 176
- FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 177 OF 1240 ***
-
- // Wavefunction(s) for diagram number 177
- // (none)
-
- // Amplitude(s) for diagram number 177
- FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 178 OF 1240 ***
-
- // Wavefunction(s) for diagram number 178
- // (none)
-
- // Amplitude(s) for diagram number 178
- FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 179 OF 1240 ***
-
- // Wavefunction(s) for diagram number 179
- FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
- // Amplitude(s) for diagram number 179
- FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 180 OF 1240 ***
-
- // Wavefunction(s) for diagram number 180
- // (none)
-
- // Amplitude(s) for diagram number 180
- VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 181 OF 1240 ***
-
- // Wavefunction(s) for diagram number 181
- // (none)
-
- // Amplitude(s) for diagram number 181
- FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 182 OF 1240 ***
-
- // Wavefunction(s) for diagram number 182
- // (none)
-
- // Amplitude(s) for diagram number 182
- FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 183 OF 1240 ***
-
- // Wavefunction(s) for diagram number 183
- // (none)
-
- // Amplitude(s) for diagram number 183
- VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 184 OF 1240 ***
-
- // Wavefunction(s) for diagram number 184
- // (none)
-
- // Amplitude(s) for diagram number 184
- FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 185 OF 1240 ***
-
- // Wavefunction(s) for diagram number 185
- // (none)
-
- // Amplitude(s) for diagram number 185
- FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
- // *** DIAGRAM 186 OF 1240 ***
-
- // Wavefunction(s) for diagram number 186
- // (none)
-
- // Amplitude(s) for diagram number 186
- VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 187 OF 1240 ***
-
- // Wavefunction(s) for diagram number 187
- // (none)
-
- // Amplitude(s) for diagram number 187
- FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 188 OF 1240 ***
-
- // Wavefunction(s) for diagram number 188
- FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
- // Amplitude(s) for diagram number 188
- FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 189 OF 1240 ***
-
- // Wavefunction(s) for diagram number 189
- // (none)
-
- // Amplitude(s) for diagram number 189
- FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 190 OF 1240 ***
-
- // Wavefunction(s) for diagram number 190
- FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
-
- // Amplitude(s) for diagram number 190
- FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 191 OF 1240 ***
-
- // Wavefunction(s) for diagram number 191
- // (none)
-
- // Amplitude(s) for diagram number 191
- FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 192 OF 1240 ***
-
- // Wavefunction(s) for diagram number 192
- // (none)
-
- // Amplitude(s) for diagram number 192
- FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 193 OF 1240 ***
-
- // Wavefunction(s) for diagram number 193
- // (none)
-
- // Amplitude(s) for diagram number 193
- FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 194 OF 1240 ***
-
- // Wavefunction(s) for diagram number 194
- // (none)
-
- // Amplitude(s) for diagram number 194
- FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 195 OF 1240 ***
-
- // Wavefunction(s) for diagram number 195
- // (none)
-
- // Amplitude(s) for diagram number 195
- VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 196 OF 1240 ***
-
- // Wavefunction(s) for diagram number 196
- // (none)
-
- // Amplitude(s) for diagram number 196
- FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 197 OF 1240 ***
-
- // Wavefunction(s) for diagram number 197
- // (none)
-
- // Amplitude(s) for diagram number 197
- FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 198 OF 1240 ***
-
- // Wavefunction(s) for diagram number 198
- // (none)
-
- // Amplitude(s) for diagram number 198
- FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 199 OF 1240 ***
-
- // Wavefunction(s) for diagram number 199
- FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
- // Amplitude(s) for diagram number 199
- FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 200 OF 1240 ***
-
- // Wavefunction(s) for diagram number 200
- // (none)
-
- // Amplitude(s) for diagram number 200
- FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 201 OF 1240 ***
-
- // Wavefunction(s) for diagram number 201
- // (none)
-
- // Amplitude(s) for diagram number 201
- FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 202 OF 1240 ***
-
- // Wavefunction(s) for diagram number 202
- // (none)
-
- // Amplitude(s) for diagram number 202
- FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 203 OF 1240 ***
-
- // Wavefunction(s) for diagram number 203
- // (none)
-
- // Amplitude(s) for diagram number 203
- FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 204 OF 1240 ***
-
- // Wavefunction(s) for diagram number 204
- // (none)
-
- // Amplitude(s) for diagram number 204
- VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 205 OF 1240 ***
-
- // Wavefunction(s) for diagram number 205
- // (none)
-
- // Amplitude(s) for diagram number 205
- FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 206 OF 1240 ***
-
- // Wavefunction(s) for diagram number 206
- // (none)
-
- // Amplitude(s) for diagram number 206
- FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 207 OF 1240 ***
-
- // Wavefunction(s) for diagram number 207
- // (none)
-
- // Amplitude(s) for diagram number 207
- FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 208 OF 1240 ***
-
- // Wavefunction(s) for diagram number 208
- FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 208
- FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 209 OF 1240 ***
-
- // Wavefunction(s) for diagram number 209
- // (none)
-
- // Amplitude(s) for diagram number 209
- FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
- // *** DIAGRAM 210 OF 1240 ***
-
- // Wavefunction(s) for diagram number 210
- // (none)
-
- // Amplitude(s) for diagram number 210
- FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 211 OF 1240 ***
-
- // Wavefunction(s) for diagram number 211
- // (none)
-
- // Amplitude(s) for diagram number 211
- FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 212 OF 1240 ***
-
- // Wavefunction(s) for diagram number 212
- // (none)
-
- // Amplitude(s) for diagram number 212
- FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 213 OF 1240 ***
-
- // Wavefunction(s) for diagram number 213
- // (none)
-
- // Amplitude(s) for diagram number 213
- VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 214 OF 1240 ***
-
- // Wavefunction(s) for diagram number 214
- // (none)
-
- // Amplitude(s) for diagram number 214
- FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 215 OF 1240 ***
-
- // Wavefunction(s) for diagram number 215
- // (none)
-
- // Amplitude(s) for diagram number 215
- FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 216 OF 1240 ***
-
- // Wavefunction(s) for diagram number 216
- // (none)
-
- // Amplitude(s) for diagram number 216
- FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 217 OF 1240 ***
-
- // Wavefunction(s) for diagram number 217
- VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
-
- // Amplitude(s) for diagram number 217
- VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 218 OF 1240 ***
-
- // Wavefunction(s) for diagram number 218
- // (none)
-
- // Amplitude(s) for diagram number 218
- VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 219 OF 1240 ***
-
- // Wavefunction(s) for diagram number 219
- // (none)
-
- // Amplitude(s) for diagram number 219
- VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 220 OF 1240 ***
-
- // Wavefunction(s) for diagram number 220
- // (none)
-
- // Amplitude(s) for diagram number 220
- FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 221 OF 1240 ***
-
- // Wavefunction(s) for diagram number 221
- // (none)
-
- // Amplitude(s) for diagram number 221
- FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 222 OF 1240 ***
-
- // Wavefunction(s) for diagram number 222
- // (none)
-
- // Amplitude(s) for diagram number 222
- FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 223 OF 1240 ***
-
- // Wavefunction(s) for diagram number 223
- // (none)
-
- // Amplitude(s) for diagram number 223
- FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 224 OF 1240 ***
-
- // Wavefunction(s) for diagram number 224
- VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 224
- VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 225 OF 1240 ***
-
- // Wavefunction(s) for diagram number 225
- // (none)
-
- // Amplitude(s) for diagram number 225
- VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 226 OF 1240 ***
-
- // Wavefunction(s) for diagram number 226
- // (none)
-
- // Amplitude(s) for diagram number 226
- VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 227 OF 1240 *** - - // Wavefunction(s) for diagram number 227 - // (none) - - // Amplitude(s) for diagram number 227 - FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 228 OF 1240 *** - - // Wavefunction(s) for diagram number 228 - // (none) - - // Amplitude(s) for diagram number 228 - FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 229 OF 1240 *** - - // Wavefunction(s) for diagram number 229 - // (none) - - // Amplitude(s) for diagram number 229 - FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 230 OF 1240 *** - - // Wavefunction(s) for diagram number 230 - // (none) - - // Amplitude(s) for diagram number 230 - FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 231 OF 1240 *** - - // Wavefunction(s) for diagram number 231 - VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] ); - - // Amplitude(s) for diagram number 231 - VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 232 OF 1240 *** - - // Wavefunction(s) for diagram number 232 - // (none) - - // Amplitude(s) for diagram number 232 - VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 233 OF 1240 *** - - // Wavefunction(s) for diagram number 233 - // (none) - - // Amplitude(s) for diagram number 233 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 234 OF 1240 *** - - // Wavefunction(s) for diagram number 234 - // (none) - - // Amplitude(s) for diagram number 234 - FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 235 OF 1240 *** - - // Wavefunction(s) for diagram number 235 - // (none) - - // Amplitude(s) for diagram number 235 - FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 236 OF 1240 *** - - // Wavefunction(s) for diagram number 236 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] ); - - // Amplitude(s) for diagram number 236 - VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 237 OF 1240 *** - - // Wavefunction(s) for diagram number 237 - // (none) - - // Amplitude(s) for diagram number 237 - FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 238 OF 1240 *** - - // Wavefunction(s) for diagram number 238 - // (none) - - // Amplitude(s) for diagram number 238 - FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - - // *** DIAGRAM 239 OF 1240 *** - - // Wavefunction(s) for diagram number 239 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] ); - - // Amplitude(s) for diagram number 239 - VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 240 OF 1240 *** - - // Wavefunction(s) for diagram number 240 - // (none) - - // Amplitude(s) for diagram 
number 240 - FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 241 OF 1240 *** - - // Wavefunction(s) for diagram number 241 - // (none) - - // Amplitude(s) for diagram number 241 - FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - - // *** DIAGRAM 242 OF 1240 *** - - // Wavefunction(s) for diagram number 242 - VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] ); - VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] ); - VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] ); - - // Amplitude(s) for diagram number 242 - VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 243 OF 1240 *** - - // Wavefunction(s) for diagram number 243 - // (none) - - // Amplitude(s) for diagram number 243 - FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 244 OF 1240 *** - - // Wavefunction(s) for diagram number 244 
- // (none) - - // Amplitude(s) for diagram number 244 - FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 245 OF 1240 *** - - // Wavefunction(s) for diagram number 245 - // (none) - - // Amplitude(s) for diagram number 245 - FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 246 OF 1240 *** - - // Wavefunction(s) for diagram number 246 - // (none) - - // Amplitude(s) for diagram number 246 - VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 247 OF 1240 *** - - // Wavefunction(s) for diagram number 247 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); - FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 247 - FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] -= amp_sv[0]; - - // *** DIAGRAM 248 OF 1240 *** - - // Wavefunction(s) for diagram number 248 - FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] ); - - // Amplitude(s) for diagram number 248 - FFV1_0( w_fp[34], w_fp[85], w_fp[5], 
COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 249 OF 1240 *** - - // Wavefunction(s) for diagram number 249 - FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); - FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] ); - - // Amplitude(s) for diagram number 249 - FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 250 OF 1240 *** - - // Wavefunction(s) for diagram number 250 - // (none) - - // Amplitude(s) for diagram number 250 - FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= amp_sv[0]; - - // *** DIAGRAM 251 OF 1240 *** - - // Wavefunction(s) for diagram number 251 - FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); - - // Amplitude(s) for diagram number 251 - FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= amp_sv[0]; - - // *** DIAGRAM 252 OF 1240 *** - - // Wavefunction(s) for diagram number 252 - // (none) - - // Amplitude(s) for diagram number 252 - FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= amp_sv[0]; - - // *** DIAGRAM 253 OF 1240 *** - - // Wavefunction(s) for diagram number 253 - FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] ); - - // Amplitude(s) for diagram number 253 - VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 254 OF 1240 *** - - // Wavefunction(s) for diagram number 254 - FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); - - // Amplitude(s) for diagram number 254 - FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 255 OF 1240 *** - - // Wavefunction(s) for diagram number 255 - // (none) - - // Amplitude(s) for diagram number 255 - FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * 
amp_sv[0]; - - // *** DIAGRAM 256 OF 1240 *** - - // Wavefunction(s) for diagram number 256 - // (none) - - // Amplitude(s) for diagram number 256 - VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - - // *** DIAGRAM 257 OF 1240 *** - - // Wavefunction(s) for diagram number 257 - FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); - - // Amplitude(s) for diagram number 257 - FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 258 OF 1240 *** - - // Wavefunction(s) for diagram number 258 - // (none) - - // Amplitude(s) for diagram number 258 - FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 259 OF 1240 *** - - // Wavefunction(s) for diagram number 259 - // (none) - - // Amplitude(s) for diagram number 259 - VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 260 OF 1240 *** - - // Wavefunction(s) for diagram number 260 - // (none) - - // Amplitude(s) for diagram number 260 - FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 261 OF 1240 *** - - // Wavefunction(s) for diagram number 261 - FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); - - // Amplitude(s) for diagram number 261 - FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 262 OF 1240 *** - - // Wavefunction(s) for diagram number 262 - // (none) - - // Amplitude(s) for diagram number 262 - FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[33] -= amp_sv[0]; - 
jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 263 OF 1240 *** - - // Wavefunction(s) for diagram number 263 - FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] ); - - // Amplitude(s) for diagram number 263 - VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 264 OF 1240 *** - - // Wavefunction(s) for diagram number 264 - // (none) - - // Amplitude(s) for diagram number 264 - VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 265 OF 1240 *** - - // Wavefunction(s) for diagram number 265 - // (none) - - // Amplitude(s) for diagram number 265 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 266 OF 1240 *** - - // Wavefunction(s) for diagram number 266 - FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] ); - - // Amplitude(s) for diagram number 266 - FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= 
cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 267 OF 1240 *** - - // Wavefunction(s) for diagram number 267 - // (none) - - // Amplitude(s) for diagram number 267 - FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 268 OF 1240 *** - - // Wavefunction(s) for diagram number 268 - // (none) - - // Amplitude(s) for diagram number 268 - FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 269 OF 1240 *** - - // Wavefunction(s) for diagram number 269 - // (none) - - // Amplitude(s) for diagram number 269 - FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - - // *** DIAGRAM 270 OF 1240 *** - - // Wavefunction(s) for diagram number 270 - FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 270 - FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 271 OF 1240 *** - - // Wavefunction(s) for diagram number 271 - FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 271 - VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 272 OF 1240 *** - - // Wavefunction(s) for diagram number 272 - // (none) - - // Amplitude(s) for diagram number 272 - FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 273 OF 1240 *** - - // Wavefunction(s) for diagram number 273 - // (none) - - // Amplitude(s) for diagram number 273 - FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 274 OF 1240 *** - - // Wavefunction(s) for diagram number 274 - FFV1P0_3( 
w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 274 - VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 275 OF 1240 *** - - // Wavefunction(s) for diagram number 275 - // (none) - - // Amplitude(s) for diagram number 275 - FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 276 OF 1240 *** - - // Wavefunction(s) for diagram number 276 - // (none) - - // Amplitude(s) for diagram number 276 - FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 277 OF 1240 *** - - // Wavefunction(s) for diagram number 277 - // (none) - - // Amplitude(s) for diagram number 277 - VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 278 OF 1240 *** - - // Wavefunction(s) for diagram number 278 - // (none) - - // Amplitude(s) for diagram number 278 - FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 279 OF 1240 *** - - // Wavefunction(s) for diagram number 279 - // (none) - - // Amplitude(s) for diagram number 279 - VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 280 OF 1240 *** - - // Wavefunction(s) for diagram number 280 - // (none) - - // Amplitude(s) for diagram number 280 - VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, 
&_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 281 OF 1240 *** - - // Wavefunction(s) for diagram number 281 - // (none) - - // Amplitude(s) for diagram number 281 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 282 OF 1240 *** - - // Wavefunction(s) for diagram number 282 - FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 282 - FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 283 OF 1240 *** - - // Wavefunction(s) for diagram number 283 - // (none) - - // Amplitude(s) for diagram number 283 - FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 284 OF 1240 *** - - // Wavefunction(s) for diagram number 284 - // (none) - - // Amplitude(s) for diagram number 284 - FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 285 OF 1240 *** - - // Wavefunction(s) for diagram number 285 - // 
(none) - - // Amplitude(s) for diagram number 285 - FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - - // *** DIAGRAM 286 OF 1240 *** - - // Wavefunction(s) for diagram number 286 - FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 286 - FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 287 OF 1240 *** - - // Wavefunction(s) for diagram number 287 - FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); - - // Amplitude(s) for diagram number 287 - VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 288 OF 1240 *** - - // Wavefunction(s) for diagram number 288 - // (none) - - // Amplitude(s) for diagram number 288 - FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 289 OF 1240 *** - - // Wavefunction(s) for diagram number 289 - // (none) - - // Amplitude(s) for diagram number 289 - FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 290 OF 1240 *** - - // Wavefunction(s) for diagram number 290 - // (none) - - // Amplitude(s) for diagram number 290 - VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 291 OF 1240 *** - - // Wavefunction(s) for diagram number 291 - // (none) - - // Amplitude(s) for diagram number 291 - FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 292 OF 1240 *** - - // Wavefunction(s) for diagram number 292 - // (none) - - // Amplitude(s) for diagram number 292 - FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); 
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 293 OF 1240 *** - - // Wavefunction(s) for diagram number 293 - // (none) - - // Amplitude(s) for diagram number 293 - VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 294 OF 1240 *** - - // Wavefunction(s) for diagram number 294 - // (none) - - // Amplitude(s) for diagram number 294 - FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 295 OF 1240 *** - - // Wavefunction(s) for diagram number 295 - // (none) - - // Amplitude(s) for diagram number 295 - VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 296 OF 1240 *** - - // Wavefunction(s) for diagram number 296 - // (none) - - // Amplitude(s) for diagram number 296 - VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 297 OF 1240 *** - - // Wavefunction(s) for diagram number 297 - // (none) - - // Amplitude(s) for diagram number 297 - VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - 
VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 298 OF 1240 *** - - // Wavefunction(s) for diagram number 298 - FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 298 - FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 299 OF 1240 *** - - // Wavefunction(s) for diagram number 299 - // (none) - - // Amplitude(s) for diagram number 299 - FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 300 OF 1240 *** - - // Wavefunction(s) for diagram number 300 - // (none) - - // Amplitude(s) for diagram number 300 - FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 301 OF 1240 *** - - // Wavefunction(s) for diagram number 301 - // (none) - - // Amplitude(s) for diagram number 301 - FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - - // *** DIAGRAM 302 OF 1240 *** - - // Wavefunction(s) for diagram number 302 - FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 302 - FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 303 OF 1240 *** - - // Wavefunction(s) for diagram number 303 - // (none) - - // Amplitude(s) for diagram number 303 - VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, 
&_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[59] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - - // *** DIAGRAM 304 OF 1240 *** - - // Wavefunction(s) for diagram number 304 - // (none) - - // Amplitude(s) for diagram number 304 - FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 305 OF 1240 *** - - // Wavefunction(s) for diagram number 305 - // (none) - - // Amplitude(s) for diagram number 305 - FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 306 OF 1240 *** - - // Wavefunction(s) for diagram number 306 - // (none) - - // Amplitude(s) for diagram number 306 - VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[83] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - - // *** DIAGRAM 307 OF 1240 *** - - // Wavefunction(s) for diagram number 307 - // (none) - - // Amplitude(s) for diagram number 307 - FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 308 OF 1240 *** - - // Wavefunction(s) for diagram number 308 - // (none) - - // Amplitude(s) for diagram number 308 - FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 309 OF 1240 *** - - // Wavefunction(s) for diagram number 309 - // (none) - - // Amplitude(s) for diagram number 309 - VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 310 OF 1240 *** - - // Wavefunction(s) for diagram number 310 - // (none) - - // Amplitude(s) for diagram number 310 - FFV1_0( w_fp[90], 
w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 311 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 311
-      FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 311
-      FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] -= amp_sv[0];
-
-      // *** DIAGRAM 312 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 312
-      // (none)
-
-      // Amplitude(s) for diagram number 312
-      FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 313 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 313
-      FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
-
-      // Amplitude(s) for diagram number 313
-      FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[59] -= amp_sv[0];
-
-      // *** DIAGRAM 314 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 314
-      // (none)
-
-      // Amplitude(s) for diagram number 314
-      FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 315 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 315
-      // (none)
-
-      // Amplitude(s) for diagram number 315
-      FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] -= amp_sv[0];
-
-      // *** DIAGRAM 316 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 316
-      // (none)
-
-      // Amplitude(s) for diagram number 316
-      FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[63] -= amp_sv[0];
-
-      // *** DIAGRAM 317 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 317
-      // (none)
-
-      // Amplitude(s) for diagram number 317
-      FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 318 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 318
-      // (none)
-
-      // Amplitude(s) for diagram number 318
-      VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-
-      // *** DIAGRAM 319 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 319
-      // (none)
-
-      // Amplitude(s) for diagram number 319
-      FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 320 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 320
-      // (none)
-
-      // Amplitude(s) for diagram number 320
-      FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[89] -= amp_sv[0];
-
-      // *** DIAGRAM 321 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 321
-      // (none)
-
-      // Amplitude(s) for diagram number 321
-      FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 322 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 322
-      FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
-
-      // Amplitude(s) for diagram number 322
-      FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[83] -= amp_sv[0];
-
-      // *** DIAGRAM 323 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 323
-      // (none)
-
-      // Amplitude(s) for diagram number 323
-      FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 324 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 324
-      // (none)
-
-      // Amplitude(s) for diagram number 324
-      FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] -= amp_sv[0];
-
-      // *** DIAGRAM 325 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 325
-      // (none)
-
-      // Amplitude(s) for diagram number 325
-      FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 326 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 326
-      // (none)
-
-      // Amplitude(s) for diagram number 326
-      FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 327 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 327
-      // (none)
-
-      // Amplitude(s) for diagram number 327
-      VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-
-      // *** DIAGRAM 328 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 328
-      // (none)
-
-      // Amplitude(s) for diagram number 328
-      FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 329 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 329
-      // (none)
-
-      // Amplitude(s) for diagram number 329
-      FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 330 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 330
-      // (none)
-
-      // Amplitude(s) for diagram number 330
-      FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 331 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 331
-      FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-      // Amplitude(s) for diagram number 331
-      FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 332 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 332
-      // (none)
-
-      // Amplitude(s) for diagram number 332
-      FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 333 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 333
-      // (none)
-
-      // Amplitude(s) for diagram number 333
-      FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 334 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 334
-      // (none)
-
-      // Amplitude(s) for diagram number 334
-      FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 335 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 335
-      // (none)
-
-      // Amplitude(s) for diagram number 335
-      FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 336 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 336
-      // (none)
-
-      // Amplitude(s) for diagram number 336
-      VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 337 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 337
-      // (none)
-
-      // Amplitude(s) for diagram number 337
-      FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 338 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 338
-      // (none)
-
-      // Amplitude(s) for diagram number 338
-      FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 339 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 339
-      // (none)
-
-      // Amplitude(s) for diagram number 339
-      FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 340 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 340
-      // (none)
-
-      // Amplitude(s) for diagram number 340
-      VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 341 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 341
-      // (none)
-
-      // Amplitude(s) for diagram number 341
-      VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 342 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 342
-      // (none)
-
-      // Amplitude(s) for diagram number 342
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 343 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 343
-      // (none)
-
-      // Amplitude(s) for diagram number 343
-      FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 344 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 344
-      // (none)
-
-      // Amplitude(s) for diagram number 344
-      FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 345 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 345
-      // (none)
-
-      // Amplitude(s) for diagram number 345
-      FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 346 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 346
-      // (none)
-
-      // Amplitude(s) for diagram number 346
-      FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 347 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 347
-      // (none)
-
-      // Amplitude(s) for diagram number 347
-      VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 348 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 348
-      // (none)
-
-      // Amplitude(s) for diagram number 348
-      VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 349 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 349
-      // (none)
-
-      // Amplitude(s) for diagram number 349
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 350 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 350
-      // (none)
-
-      // Amplitude(s) for diagram number 350
-      FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 351 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 351
-      // (none)
-
-      // Amplitude(s) for diagram number 351
-      FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 352 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 352
-      // (none)
-
-      // Amplitude(s) for diagram number 352
-      FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 353 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 353
-      // (none)
-
-      // Amplitude(s) for diagram number 353
-      FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 354 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 354
-      // (none)
-
-      // Amplitude(s) for diagram number 354
-      VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 355 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 355
-      // (none)
-
-      // Amplitude(s) for diagram number 355
-      VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 356 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 356
-      // (none)
-
-      // Amplitude(s) for diagram number 356
-      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 357 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 357
-      // (none)
-
-      // Amplitude(s) for diagram number 357
-      FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 358 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 358
-      // (none)
-
-      // Amplitude(s) for diagram number 358
-      FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 359 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 359
-      // (none)
-
-      // Amplitude(s) for diagram number 359
-      VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 360 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 360
-      // (none)
-
-      // Amplitude(s) for diagram number 360
-      FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 361 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 361
-      // (none)
-
-      // Amplitude(s) for diagram number 361
-      FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 362 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 362
-      // (none)
-
-      // Amplitude(s) for diagram number 362
-      VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 363 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 363
-      // (none)
-
-      // Amplitude(s) for diagram number 363
-      FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 364 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 364
-      // (none)
-
-      // Amplitude(s) for diagram number 364
-      FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 365 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 365
-      // (none)
-
-      // Amplitude(s) for diagram number 365
-      VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 366 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 366
-      // (none)
-
-      // Amplitude(s) for diagram number 366
-      FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 367 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 367
-      // (none)
-
-      // Amplitude(s) for diagram number 367
-      FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 368 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 368
-      // (none)
-
-      // Amplitude(s) for diagram number 368
-      FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 369 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 369
-      // (none)
-
-      // Amplitude(s) for diagram number 369
-      VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 370 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 370
-      VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 370
-      FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 371 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 371
-      // (none)
-
-      // Amplitude(s) for diagram number 371
-      FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 372 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 372
-      VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] );
-      FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] );
-
-      // Amplitude(s) for diagram number 372
-      VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 373 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 373
-      // (none)
-
-      // Amplitude(s) for diagram number 373
-      FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-
-      // *** DIAGRAM 374 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 374
-      VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 374
-      VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 375 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 375
-      // (none)
-
-      // Amplitude(s) for diagram number 375
-      FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-
-      // *** DIAGRAM 376 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 376
-      VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
-      VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
-
-      // Amplitude(s) for diagram number 376
-      FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 377 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 377
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] );
-
-      // Amplitude(s) for diagram number 377
-      FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 378 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 378
-      FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
-      // Amplitude(s) for diagram number 378
-      FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 379 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 379
-      // (none)
-
-      // Amplitude(s) for diagram number 379
-      FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-
-      // *** DIAGRAM 380 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 380
-      // (none)
-
-      // Amplitude(s) for diagram number 380
-      FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 381 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 381
-      FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] );
-
-      // Amplitude(s) for diagram number 381
-      FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 382 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 382
-      // (none)
-
-      // Amplitude(s) for diagram number 382
-      FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-
-      // *** DIAGRAM 383 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 383
-      // (none)
-
-      // Amplitude(s) for diagram number 383
-      FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[25] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-
-      // *** DIAGRAM 384 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 384
-      // (none)
-
-      // Amplitude(s) for diagram number 384
-      FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-
-      // *** DIAGRAM 385 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 385
-      VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] );
-
-      // Amplitude(s) for diagram number 385
-      FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 386 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 386
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-      // Amplitude(s) for diagram number 386
-      FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 387 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 387
-      // (none)
-
-      // Amplitude(s) for diagram number 387
-      FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 388 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 388
-      FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
-
-      // Amplitude(s) for diagram number 388
-      VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 389 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 389
-      // (none)
-
-      // Amplitude(s) for diagram number 389
-      FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-
-      // *** DIAGRAM 390 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 390
-      // (none)
-
-      // Amplitude(s) for diagram number 390
-      VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 391 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 391
-      // (none)
-
-      // Amplitude(s) for diagram number 391
-      FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 392 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 392
-      // (none)
-
-      // Amplitude(s) for diagram number 392
-      FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 393 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 393
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-      // Amplitude(s) for diagram number 393
-      FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 394 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 394
-      FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );
-
-      // Amplitude(s) for diagram number 394
-      FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 395 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 395
-      // (none)
-
-      // Amplitude(s) for diagram number 395
-      FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[94] += amp_sv[0];
-
-      // *** DIAGRAM 396 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 396
-      // (none)
-
-      // Amplitude(s) for diagram number 396
-      FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 397 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 397
-      FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 397
-      FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 398 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 398
-      // (none)
-
-      // Amplitude(s) for diagram number 398
-      FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 399 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 399
-      // (none)
-
-      // Amplitude(s) for diagram number 399
-      FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[94] -= amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 400 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 400
-      // (none)
-
-      // Amplitude(s) for diagram number 400
-      FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-
-      // *** DIAGRAM 401 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 401
-      // (none)
-
-      // Amplitude(s) for diagram number 401
-      FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 402 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 402
-      // (none)
-
-      // Amplitude(s) for diagram number 402
-      FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-
-      // *** DIAGRAM 403 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 403
-      // (none)
-
-      // Amplitude(s) for diagram number 403
-      FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 404 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 404
-      // (none)
-
-      // Amplitude(s) for diagram number 404
-      FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-
-      // *** DIAGRAM 405 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 405
-      // (none)
-
-      // Amplitude(s) for diagram number 405
-      FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 406 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 406
-      // (none)
-
-      // Amplitude(s) for diagram number 406
-      FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 407 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 407
-      // (none)
-
-      // Amplitude(s) for diagram number 407
-      FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 408 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 408
-      // (none)
-
-      // Amplitude(s) for diagram number 408
-      VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[78] += amp_sv[0];
-      jamp_sv[80] -= amp_sv[0];
-      jamp_sv[98] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[108] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 409 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 409
-      VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 409
-      VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[78] += amp_sv[0];
-      jamp_sv[80] -= amp_sv[0];
-      jamp_sv[98] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[108] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 410 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 410
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );
-
-      // Amplitude(s) for diagram number 410
-      VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 411 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 411
-      // (none)
-
-      // Amplitude(s) for diagram number 411
-      VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 412 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 412
-      // (none)
-
-      // Amplitude(s) for diagram number 412
-      FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 413 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 413
-      // (none)
-
-      // Amplitude(s) for diagram number 413
-      FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 414 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 414
-      // (none)
-
-      // Amplitude(s) for diagram number 414
-      FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 415 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 415
-      // (none)
-
-      // Amplitude(s) for diagram number 415
-      FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 416 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 416
-      // (none)
-
-      // Amplitude(s) for diagram number 416
-      FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-
-      // *** DIAGRAM 417 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 417
-      // (none)
-
-      // Amplitude(s) for diagram number 417
-      FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-
-      // *** DIAGRAM 418 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 418
-      // (none)
-
-      // Amplitude(s) for diagram number 418
-      FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-
-      // *** DIAGRAM 419 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 419
-      // (none)
-
-      // Amplitude(s) for diagram number 419
-      FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 420 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 420
-      // (none)
-
-      // Amplitude(s) for diagram number 420
-      FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 421 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 421
-      // (none)
-
-      // Amplitude(s) for diagram number 421
-      FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 422 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 422
-      // (none)
-
-      // Amplitude(s) for diagram number 422
-      FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 423 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 423
-      // (none)
-
-      // Amplitude(s) for diagram number 423
-      FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 424 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 424
-      // (none)
-
-      // Amplitude(s) for diagram number 424
-      VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 425 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 425
-      VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 425
-      VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[84] +=
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
-
- // *** DIAGRAM 426 OF 1240 ***
-
- // Wavefunction(s) for diagram number 426
- // (none)
-
- // Amplitude(s) for diagram number 426
- VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 427 OF 1240 ***
-
- // Wavefunction(s) for diagram number 427
- // (none)
-
- // Amplitude(s) for diagram number 427
- VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 428 OF 1240 ***
-
- // Wavefunction(s) for diagram number 428
- // (none)
-
- // Amplitude(s) for diagram number 428
- FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 429 OF 1240 ***
-
- // Wavefunction(s) for diagram number 429
- // (none)
-
- // Amplitude(s) for diagram number 429
- FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
-
- // *** DIAGRAM 430 OF 1240 ***
-
- // Wavefunction(s) for diagram number 430
- // (none)
-
- // Amplitude(s) for diagram number 430
- FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
-
- // *** DIAGRAM 431 OF 1240 ***
-
- // Wavefunction(s) for diagram number 431
- // (none)
-
- // Amplitude(s) for diagram number 431
- FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 432 OF 1240 ***
-
- // Wavefunction(s) for diagram number 432
- // (none)
-
- // Amplitude(s) for diagram number 432
- FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
-
- // *** DIAGRAM 433 OF 1240 ***
-
- // Wavefunction(s) for diagram number 433
- // (none)
-
- // Amplitude(s) for diagram number 433
- FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
-
- // *** DIAGRAM 434 OF 1240 ***
-
- // Wavefunction(s) for diagram number 434
- VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 434
- VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 435 OF 1240 ***
-
- // Wavefunction(s) for diagram number 435
- // (none)
-
- // Amplitude(s) for diagram number 435
- VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 436 OF 1240 ***
-
- // Wavefunction(s) for diagram number 436
- // (none)
-
- // Amplitude(s) for diagram number 436
- VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 437 OF 1240 ***
-
- // Wavefunction(s) for diagram number 437
- VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
-
- // Amplitude(s) for diagram number 437
- VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 438 OF 1240 ***
-
- // Wavefunction(s) for diagram number 438
- // (none)
-
- // Amplitude(s) for diagram number 438
- VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 439 OF 1240 ***
-
- // Wavefunction(s) for diagram number 439
- // (none)
-
- // Amplitude(s) for diagram number 439
- VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 440 OF 1240 ***
-
- // Wavefunction(s) for diagram number 440
- // (none)
-
- // Amplitude(s) for diagram number 440
- VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 441 OF 1240 ***
-
- // Wavefunction(s) for diagram number 441
- // (none)
-
- // Amplitude(s) for diagram number 441
- VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 442 OF 1240 ***
-
- // Wavefunction(s) for diagram number 442
- // (none)
-
- // Amplitude(s) for diagram number 442
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 443 OF 1240 ***
-
- // Wavefunction(s) for diagram number 443
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 443
- VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 444 OF 1240 ***
-
- // Wavefunction(s) for diagram number 444
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
-
- // Amplitude(s) for diagram number 444
- VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 445 OF 1240 ***
-
- // Wavefunction(s) for diagram number 445
- // (none)
-
- // Amplitude(s) for diagram number 445
- VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 446 OF 1240 ***
-
- // Wavefunction(s) for diagram number 446
- // (none)
-
- // Amplitude(s) for diagram number 446
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 447 OF 1240 ***
-
- // Wavefunction(s) for diagram number 447
- // (none)
-
- // Amplitude(s) for diagram number 447
- VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 448 OF 1240 ***
-
- // Wavefunction(s) for diagram number 448
- // (none)
-
- // Amplitude(s) for diagram number 448
- VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 449 OF 1240 ***
-
- // Wavefunction(s) for diagram number 449
- // (none)
-
- // Amplitude(s) for diagram number 449
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 450 OF 1240 ***
-
- // Wavefunction(s) for diagram number 450
- // (none)
-
- // Amplitude(s) for diagram number 450
- VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 451 OF 1240 ***
-
- // Wavefunction(s) for diagram number 451
- // (none)
-
- // Amplitude(s) for diagram number 451
- FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 452 OF 1240 ***
-
- // Wavefunction(s) for diagram number 452
- // (none)
-
- // Amplitude(s) for diagram number 452
- FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 453 OF 1240 ***
-
- // Wavefunction(s) for diagram number 453
- // (none)
-
- // Amplitude(s) for diagram number 453
- FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 454 OF 1240 ***
-
- // Wavefunction(s) for diagram number 454
- // (none)
-
- // Amplitude(s) for diagram number 454
- FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
-
- // *** DIAGRAM 455 OF 1240 ***
-
- // Wavefunction(s) for diagram number 455
- // (none)
-
- // Amplitude(s) for diagram number 455
- VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 456 OF 1240 ***
-
- // Wavefunction(s) for diagram number 456
- // (none)
-
- // Amplitude(s) for diagram number 456
- FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 457 OF 1240 ***
-
- // Wavefunction(s) for diagram number 457
- // (none)
-
- // Amplitude(s) for diagram number 457
- FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 458 OF 1240 ***
-
- // Wavefunction(s) for diagram number 458
- // (none)
-
- // Amplitude(s) for diagram number 458
- FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 459 OF 1240 ***
-
- // Wavefunction(s) for diagram number 459
- // (none)
-
- // Amplitude(s) for diagram number 459
- FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 460 OF 1240 ***
-
- // Wavefunction(s) for diagram number 460
- // (none)
-
- // Amplitude(s) for diagram number 460
- VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 461 OF 1240 ***
-
- // Wavefunction(s) for diagram number 461
- // (none)
-
- // Amplitude(s) for diagram number 461
- FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 462 OF 1240 ***
-
- // Wavefunction(s) for diagram number 462
- // (none)
-
- // Amplitude(s) for diagram number 462
- FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 463 OF 1240 ***
-
- // Wavefunction(s) for diagram number 463
- // (none)
-
- // Amplitude(s) for diagram number 463
- FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 464 OF 1240 ***
-
- // Wavefunction(s) for diagram number 464
- // (none)
-
- // Amplitude(s) for diagram number 464
- FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 465 OF 1240 ***
-
- // Wavefunction(s) for diagram number 465
- // (none)
-
- // Amplitude(s) for diagram number 465
- VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 466 OF 1240 ***
-
- // Wavefunction(s) for diagram number 466
- // (none)
-
- // Amplitude(s) for diagram number 466
- FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 467 OF 1240 ***
-
- // Wavefunction(s) for diagram number 467
- // (none)
-
- // Amplitude(s) for diagram number 467
- FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 468 OF 1240 ***
-
- // Wavefunction(s) for diagram number 468
- // (none)
-
- // Amplitude(s) for diagram number 468
- FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 469 OF 1240 ***
-
- // Wavefunction(s) for diagram number 469
- // (none)
-
- // Amplitude(s) for diagram number 469
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 470 OF 1240 ***
-
- // Wavefunction(s) for diagram number 470
- // (none)
-
- // Amplitude(s) for diagram number 470
- VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 471 OF 1240 ***
-
- // Wavefunction(s) for diagram number 471
- // (none)
-
- // Amplitude(s) for diagram number 471
- FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
-
- // *** DIAGRAM 472 OF 1240 ***
-
- // Wavefunction(s) for diagram number 472
- // (none)
-
- // Amplitude(s) for diagram number 472
- FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 473 OF 1240 ***
-
- // Wavefunction(s) for diagram number 473
- // (none)
-
- // Amplitude(s) for diagram number 473
- FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 474 OF 1240 ***
-
- // Wavefunction(s) for diagram number 474
- // (none)
-
- // Amplitude(s) for diagram number 474
- FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 475 OF 1240 ***
-
- // Wavefunction(s) for diagram number 475
- // (none)
-
- // Amplitude(s) for diagram number 475
- VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 476 OF 1240 ***
-
- // Wavefunction(s) for diagram number 476
- // (none)
-
- // Amplitude(s) for diagram number 476
- FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 477 OF 1240 ***
-
- // Wavefunction(s) for diagram number 477
- // (none)
-
- // Amplitude(s) for diagram number 477
- VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 478 OF 1240 *** - - // Wavefunction(s) for diagram number 478 - // (none) - - // Amplitude(s) for diagram number 478 - FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 479 OF 1240 *** - - // Wavefunction(s) for diagram number 479 - // (none) - - // Amplitude(s) for diagram number 479 - FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 480 OF 1240 *** - - // Wavefunction(s) for diagram number 480 - // (none) - - // Amplitude(s) for diagram number 480 - FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 481 OF 1240 *** - - // Wavefunction(s) for diagram number 481 - // (none) - - // Amplitude(s) for diagram number 481 - FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - - // *** DIAGRAM 482 OF 1240 *** - - // Wavefunction(s) for diagram number 482 - // (none) - - // Amplitude(s) for diagram number 482 - VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 483 OF 1240 *** - - // Wavefunction(s) for diagram number 483 - // (none) - - // Amplitude(s) for diagram number 483 - FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 484 OF 1240 *** - - // Wavefunction(s) for diagram number 484 - // (none) - - // Amplitude(s) for diagram number 484 - FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 485 OF 1240 *** - - // Wavefunction(s) for diagram number 485 - // (none) - - // Amplitude(s) for diagram number 485 - FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 486 OF 1240 *** - - // Wavefunction(s) for diagram number 486 - // (none) - - // Amplitude(s) for diagram number 486 - FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 487 OF 1240 *** - - // Wavefunction(s) for diagram number 487 - // (none) - - // Amplitude(s) for diagram number 487 - FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[48] -= 
amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - - // *** DIAGRAM 488 OF 1240 *** - - // Wavefunction(s) for diagram number 488 - // (none) - - // Amplitude(s) for diagram number 488 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 489 OF 1240 *** - - // Wavefunction(s) for diagram number 489 - // (none) - - // Amplitude(s) for diagram number 489 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 490 OF 1240 *** - - // Wavefunction(s) for diagram number 490 - // (none) - - // Amplitude(s) for diagram number 490 - FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 491 OF 1240 *** - - // Wavefunction(s) for diagram number 491 - // (none) - - // Amplitude(s) for diagram number 491 - FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * 
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 492 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 492
-      // (none)
-
-      // Amplitude(s) for diagram number 492
-      VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[6] -= amp_sv[0];
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[48] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[116] += amp_sv[0];
-      jamp_sv[117] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 493 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 493
-      VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 493
-      FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 494 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 494
-      // (none)
-
-      // Amplitude(s) for diagram number 494
-      FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 495 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 495
-      VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
-      // Amplitude(s) for diagram number 495
-      VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 496 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 496
-      // (none)
-
-      // Amplitude(s) for diagram number 496
-      FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-
-      // *** DIAGRAM 497 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 497
-      VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 497
-      VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 498 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 498
-      // (none)
-
-      // Amplitude(s) for diagram number 498
-      FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-
-      // *** DIAGRAM 499 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 499
-      VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-      VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
-      VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-      // Amplitude(s) for diagram number 499
-      FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 500 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 500
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-      // Amplitude(s) for diagram number 500
-      FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 501 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 501
-      FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-      // Amplitude(s) for diagram number 501
-      FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 502 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 502
-      // (none)
-
-      // Amplitude(s) for diagram number 502
-      FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-
-      // *** DIAGRAM 503 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 503
-      // (none)
-
-      // Amplitude(s) for diagram number 503
-      FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 504 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 504
-      FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-      // Amplitude(s) for diagram number 504
-      FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 505 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 505
-      // (none)
-
-      // Amplitude(s) for diagram number 505
-      FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-
-      // *** DIAGRAM 506 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 506
-      // (none)
-
-      // Amplitude(s) for diagram number 506
-      FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-
-      // *** DIAGRAM 507 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 507
-      // (none)
-
-      // Amplitude(s) for diagram number 507
-      FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-
-      // *** DIAGRAM 508 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 508
-      VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
-
-      // Amplitude(s) for diagram number 508
-      FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 509 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 509
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
-
-      // Amplitude(s) for diagram number 509
-      FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 510 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 510
-      // (none)
-
-      // Amplitude(s) for diagram number 510
-      FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 511 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 511
-      // (none)
-
-      // Amplitude(s) for diagram number 511
-      VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 512 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 512
-      // (none)
-
-      // Amplitude(s) for diagram number 512
-      FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-
-      // *** DIAGRAM 513 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 513
-      // (none)
-
-      // Amplitude(s) for diagram number 513
-      VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 514 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 514
-      // (none)
-
-      // Amplitude(s) for diagram number 514
-      FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 515 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 515
-      // (none)
-
-      // Amplitude(s) for diagram number 515
-      FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 516 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 516
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
-
-      // Amplitude(s) for diagram number 516
-      FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 517 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 517
-      FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
-      // Amplitude(s) for diagram number 517
-      FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 518 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 518
-      // (none)
-
-      // Amplitude(s) for diagram number 518
-      FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-
-      // *** DIAGRAM 519 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 519
-      // (none)
-
-      // Amplitude(s) for diagram number 519
-      FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 520 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 520
-      FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 520
-      FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 521 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 521
-      // (none)
-
-      // Amplitude(s) for diagram number 521
-      FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 522 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 522
-      // (none)
-
-      // Amplitude(s) for diagram number 522
-      FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 523 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 523
-      // (none)
-
-      // Amplitude(s) for diagram number 523
-      FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-
-      // *** DIAGRAM 524 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 524
-      // (none)
-
-      // Amplitude(s) for diagram number 524
-      FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 525 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 525
-      // (none)
-
-      // Amplitude(s) for diagram number 525
-      FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-
-      // *** DIAGRAM 526 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 526
-      // (none)
-
-      // Amplitude(s) for diagram number 526
-      FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 527 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 527
-      // (none)
-
-      // Amplitude(s) for diagram number 527
-      FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-
-      // *** DIAGRAM 528 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 528
-      // (none)
-
-      // Amplitude(s) for diagram number 528
-      FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 529 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 529
-      // (none)
-
-      // Amplitude(s) for diagram number 529
-      FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 530 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 530
-      // (none)
-
-      // Amplitude(s) for diagram number 530
-      FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 531 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 531
-      // (none)
-
-      // Amplitude(s) for diagram number 531
-      VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 532 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 532
-      VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 532
-      VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 533 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 533
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-      // Amplitude(s) for diagram number 533
-      VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 534 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 534
-      // (none)
-
-      // Amplitude(s) for diagram number 534
-      VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 535 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 535
-      // (none)
-
-      // Amplitude(s) for diagram number 535
-      FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 536 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 536
-      // (none)
-
-      // Amplitude(s) for diagram number 536
-      FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 537 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 537
-      // (none)
-
-      // Amplitude(s) for diagram number 537
-      FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 538 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 538
-      // (none)
-
-      // Amplitude(s) for diagram number 538
-      FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 539 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 539
-      // (none)
-
-      // Amplitude(s) for diagram number 539
-      FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-
-      // *** DIAGRAM 540 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 540
-      // (none)
-
-      // Amplitude(s) for diagram number 540
-      FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-
-      // *** DIAGRAM 541 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 541
-      // (none)
-
-      // Amplitude(s) for diagram number 541
-      FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-
-      // *** DIAGRAM 542 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 542
-      // (none)
-
-      // Amplitude(s) for diagram number 542
-      FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 543 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 543
-      // (none)
-
-      // Amplitude(s) for diagram number 543
-      FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 544 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 544
-      // (none)
-
-      // Amplitude(s) for diagram number 544
-      FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 545 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 545
-      // (none)
-
-      // Amplitude(s) for diagram number 545
-      FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 546 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 546
-      // (none)
-
-      // Amplitude(s) for diagram number 546
-      FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 547 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 547
-      // (none)
-
-      // Amplitude(s) for diagram number 547
-      VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 548 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 548
-      VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 548
-      VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 549 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 549
-      // (none)
-
-      // Amplitude(s) for diagram number 549
-      VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 550 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 550
-      // (none)
-
-      // Amplitude(s) for diagram number 550
-      VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 551 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 551
-      // (none)
-
-      // Amplitude(s) for diagram number 551
-      FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 552 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 552
-      // (none)
-
-      // Amplitude(s) for diagram number 552
-      FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-
-      // *** DIAGRAM 553 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 553
-      // (none)
-
-      // Amplitude(s) for diagram number 553
-      FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-
-      // *** DIAGRAM 554 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 554
-      // (none)
-
-      // Amplitude(s) for diagram number 554
-      FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 555 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 555
-      // (none)
-
-      // Amplitude(s) for diagram number 555
-      FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-
-      // *** DIAGRAM 556 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 556
-      // (none)
-
-      // Amplitude(s) for diagram number 556
-      FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 557 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 557
-      VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 557
-      VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 558 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 558
-      // (none)
-
-      // Amplitude(s) for diagram number 558
-      VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 559 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 559
-      // (none)
-
-      // Amplitude(s) for diagram number 559
-      VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 560 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 560
-      // (none)
-
-      // Amplitude(s) for diagram number 560
-      VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 561 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 561
-      // (none)
-
-      // Amplitude(s) for diagram number 561
-      VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 562 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 562
-      // (none)
-
-      // Amplitude(s) for diagram number 562
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 563 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 563
-      // (none)
-
-      // Amplitude(s) for diagram number 563
-      VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 564 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 564
-      // (none)
-
-      // Amplitude(s) for diagram number 564
-      VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-
-      // *** DIAGRAM 565 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 565
-      // (none)
-
-      // Amplitude(s) for diagram number 565
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 566 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 566
-      VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
-      VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
-      VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
-
-      // Amplitude(s) for diagram number 566
-      VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 567 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 567
-      VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
-      VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
-
-      // Amplitude(s) for diagram number 567
-      VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 568 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 568
-      // (none)
-
-      // Amplitude(s) for diagram number 568
-      VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 569 OF 1240 *** - - // Wavefunction(s) for diagram number 569 - // (none) - - // Amplitude(s) for diagram number 569 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 570 OF 1240 *** - - // Wavefunction(s) for diagram number 570 - // (none) - - // Amplitude(s) for diagram number 570 - VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - - // *** DIAGRAM 571 OF 1240 *** - - // Wavefunction(s) for diagram number 571 - // (none) - - // Amplitude(s) for diagram number 571 - VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - 
jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 572 OF 1240 *** - - // Wavefunction(s) for diagram number 572 - // (none) - - // Amplitude(s) for diagram number 572 - VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 573 OF 1240 *** - - // Wavefunction(s) for diagram number 573 - // (none) - - // Amplitude(s) for diagram number 573 - VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 574 OF 1240 *** - - // Wavefunction(s) for diagram number 574 - // (none) - - // Amplitude(s) for diagram number 574 - FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - - // *** DIAGRAM 575 OF 1240 *** - - // Wavefunction(s) for diagram number 575 - // (none) - - // Amplitude(s) for diagram number 575 - FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 576 OF 1240 *** - - // Wavefunction(s) for diagram number 576 - // (none) - - // Amplitude(s) for diagram number 576 - FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 577 OF 1240 *** - - // Wavefunction(s) for diagram number 577 - // (none) - - // Amplitude(s) for diagram number 577 - FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= 
amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 578 OF 1240 *** - - // Wavefunction(s) for diagram number 578 - // (none) - - // Amplitude(s) for diagram number 578 - VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 579 OF 1240 *** - - // Wavefunction(s) for diagram number 579 - // (none) - - // Amplitude(s) for diagram number 579 - FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 580 OF 1240 *** - - // Wavefunction(s) for diagram number 580 - // (none) - - // Amplitude(s) for diagram number 580 - FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 581 OF 1240 *** - - // Wavefunction(s) for diagram number 581 - // (none) - - // Amplitude(s) for diagram number 581 - FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 582 OF 1240 *** - - // Wavefunction(s) for diagram number 582 - // (none) - - // Amplitude(s) for diagram number 582 - FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 583 OF 1240 *** - - // Wavefunction(s) for diagram number 583 - // (none) - - // Amplitude(s) for diagram number 583 - VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 584 OF 1240 *** - - // Wavefunction(s) for diagram number 584 - // (none) - - // Amplitude(s) for diagram number 584 - FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - - // *** DIAGRAM 585 OF 1240 *** - - // Wavefunction(s) for diagram number 585 - // (none) - - // Amplitude(s) for diagram number 585 - FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 586 OF 1240 *** - - // Wavefunction(s) for diagram number 586 - // (none) - - // Amplitude(s) for diagram number 586 - FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 587 OF 1240 *** - - // Wavefunction(s) for diagram number 587 - // (none) - - // Amplitude(s) for diagram number 587 - FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 588 OF 1240 *** - - // Wavefunction(s) for diagram number 588 - // (none) - - // Amplitude(s) for diagram number 588 - VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 589 OF 1240 *** - - // Wavefunction(s) for diagram number 589 - // (none) - - // Amplitude(s) for 
diagram number 589 - FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 590 OF 1240 *** - - // Wavefunction(s) for diagram number 590 - // (none) - - // Amplitude(s) for diagram number 590 - FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 591 OF 1240 *** - - // Wavefunction(s) for diagram number 591 - // (none) - - // Amplitude(s) for diagram number 591 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 592 OF 1240 *** - - // Wavefunction(s) for diagram number 592 - // (none) - - // Amplitude(s) for diagram number 592 - FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 593 OF 1240 *** - - // Wavefunction(s) for diagram number 593 - // (none) - - // Amplitude(s) for diagram number 593 - VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 594 OF 1240 *** - - // Wavefunction(s) for diagram 
number 594 - // (none) - - // Amplitude(s) for diagram number 594 - FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - - // *** DIAGRAM 595 OF 1240 *** - - // Wavefunction(s) for diagram number 595 - // (none) - - // Amplitude(s) for diagram number 595 - FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 596 OF 1240 *** - - // Wavefunction(s) for diagram number 596 - // (none) - - // Amplitude(s) for diagram number 596 - FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 597 OF 1240 *** - - // Wavefunction(s) for diagram number 597 - // (none) - - // Amplitude(s) for diagram number 597 - FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 598 OF 1240 *** - - // Wavefunction(s) for diagram number 598 - // (none) - - // Amplitude(s) for diagram number 598 - VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 599 OF 1240 *** - - // Wavefunction(s) for diagram number 599 - // (none) - - // Amplitude(s) for diagram number 599 - FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * 
amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 600 OF 1240 *** - - // Wavefunction(s) for diagram number 600 - // (none) - - // Amplitude(s) for diagram number 600 - VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 601 OF 1240 *** - - // Wavefunction(s) for diagram number 601 - // (none) - - // Amplitude(s) for diagram number 601 - FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - - // *** DIAGRAM 602 OF 1240 *** - - // Wavefunction(s) for diagram number 602 - // (none) - - // Amplitude(s) for diagram number 602 - FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 603 OF 1240 *** - - // Wavefunction(s) for diagram number 603 - // (none) - - // Amplitude(s) for diagram number 603 - FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 604 OF 1240 *** - - // Wavefunction(s) for diagram number 604 - // (none) - - // Amplitude(s) for diagram number 604 - FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - - // *** DIAGRAM 605 OF 1240 *** - - // Wavefunction(s) for diagram number 605 - // (none) - - // Amplitude(s) for diagram number 605 - VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 606 OF 1240 *** - - // Wavefunction(s) for diagram number 606 - // (none) - - // Amplitude(s) for diagram number 606 - FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 607 OF 1240 *** - - // Wavefunction(s) for diagram number 607 - // (none) - - // Amplitude(s) for diagram number 607 - FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 608 OF 1240 *** - - // Wavefunction(s) for diagram number 608 - // (none) - - // Amplitude(s) for diagram number 608 - FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 609 OF 1240 *** - - // Wavefunction(s) for diagram number 609 - // (none) - - // Amplitude(s) for diagram number 609 - FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; 
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 610 OF 1240 *** - - // Wavefunction(s) for diagram number 610 - // (none) - - // Amplitude(s) for diagram number 610 - FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - - // *** DIAGRAM 611 OF 1240 *** - - // Wavefunction(s) for diagram number 611 - // (none) - - // Amplitude(s) for diagram number 611 - FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 612 OF 1240 *** - - // Wavefunction(s) for diagram number 612 - // (none) - - // Amplitude(s) for diagram number 612 - FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 613 OF 1240 *** - - // Wavefunction(s) for diagram number 613 - // (none) - - // Amplitude(s) for diagram number 613 - FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 614 OF 1240 *** - - // Wavefunction(s) for diagram number 614 - // (none) - - // Amplitude(s) for 
diagram number 614 - FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 615 OF 1240 *** - - // Wavefunction(s) for diagram number 615 - // (none) - - // Amplitude(s) for diagram number 615 - VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 616 OF 1240 *** - - // Wavefunction(s) for diagram number 616 - VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 616 - FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 
0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 617 OF 1240 *** - - // Wavefunction(s) for diagram number 617 - // (none) - - // Amplitude(s) for diagram number 617 - FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 618 OF 1240 *** - - // Wavefunction(s) for diagram number 618 - VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] ); - - // Amplitude(s) for diagram number 618 - VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 619 OF 1240 *** - - // Wavefunction(s) for diagram number 619 - // (none) - - // Amplitude(s) for diagram number 619 - FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - - // *** DIAGRAM 620 OF 1240 *** - - // Wavefunction(s) for diagram number 620 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 620 - VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 621 OF 1240 *** - - // Wavefunction(s) for diagram number 621 - // (none) - - // Amplitude(s) for diagram number 621 - FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - - // *** DIAGRAM 622 OF 1240 *** - - // Wavefunction(s) for diagram number 622 - VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] ); - - // Amplitude(s) for diagram number 622 - FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 623 OF 1240 *** - - // Wavefunction(s) for diagram number 623 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); - - // Amplitude(s) for diagram number 623 - FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 624 OF 1240 *** - - // Wavefunction(s) for diagram number 624 - FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); - - // Amplitude(s) for diagram number 624 - FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 625 OF 1240 *** - - // Wavefunction(s) for diagram number 625 - // (none) - - // Amplitude(s) for diagram number 625 - FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - - // *** DIAGRAM 626 OF 1240 *** - - // Wavefunction(s) for diagram number 626 - // (none) - - // Amplitude(s) for diagram number 626 - FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 627 OF 1240 *** - - // Wavefunction(s) for diagram number 627 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); - - // Amplitude(s) for diagram number 627 - FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - 
jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 628 OF 1240 *** - - // Wavefunction(s) for diagram number 628 - // (none) - - // Amplitude(s) for diagram number 628 - FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - - // *** DIAGRAM 629 OF 1240 *** - - // Wavefunction(s) for diagram number 629 - // (none) - - // Amplitude(s) for diagram number 629 - FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - - // *** DIAGRAM 630 OF 1240 *** - - // Wavefunction(s) for diagram number 630 - // (none) - - // Amplitude(s) for diagram number 630 - FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - - // *** DIAGRAM 631 OF 1240 *** - - // Wavefunction(s) for diagram number 631 - VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] ); - - // Amplitude(s) for diagram number 631 - FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 632 OF 1240 *** - - // Wavefunction(s) for diagram number 632 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] ); - - // Amplitude(s) for diagram number 632 - FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 633 OF 1240 *** - - // Wavefunction(s) for diagram number 633 - // (none) - - // Amplitude(s) for diagram number 633 - FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 634 OF 1240 *** - - // Wavefunction(s) for diagram number 634 - // (none) - - // Amplitude(s) for diagram number 634 - VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 635 OF 1240 *** - - // Wavefunction(s) for diagram number 635 - // (none) - - // Amplitude(s) for diagram number 635 - FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 636 OF 1240 *** - - // Wavefunction(s) for diagram number 636 - // (none) - - // Amplitude(s) for diagram number 636 - VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 637 OF 1240 *** - - // Wavefunction(s) for diagram number 637 - // (none) - - // Amplitude(s) for diagram number 637 - FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 638 OF 1240 *** - - // Wavefunction(s) for diagram number 638 - // (none) - - // Amplitude(s) for diagram number 638 - FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 639 OF 1240 *** - - // Wavefunction(s) for diagram number 639 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); - - // Amplitude(s) for diagram number 639 - FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 640 OF 1240 *** - - // Wavefunction(s) for diagram number 640 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); - - // Amplitude(s) for diagram number 640 - FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 641 OF 1240 *** - - // Wavefunction(s) for diagram number 641 - // (none) - - // Amplitude(s) for diagram number 641 - FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[53] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - - // *** DIAGRAM 642 OF 1240 *** - - // Wavefunction(s) for diagram number 642 - // (none) - - // Amplitude(s) for diagram number 642 - FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 643 OF 1240 *** - - // Wavefunction(s) for diagram number 643 - FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 643 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 644 OF 1240 *** - - // Wavefunction(s) for diagram number 644 - // (none) - - // Amplitude(s) for diagram number 644 - FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[77] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - - // *** DIAGRAM 645 OF 1240 *** - - // Wavefunction(s) for diagram number 645 - // (none) - - // Amplitude(s) for diagram number 645 - FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - 
jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
-
- // *** DIAGRAM 646 OF 1240 ***
-
- // Wavefunction(s) for diagram number 646
- // (none)
-
- // Amplitude(s) for diagram number 646
- FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
-
- // *** DIAGRAM 647 OF 1240 ***
-
- // Wavefunction(s) for diagram number 647
- // (none)
-
- // Amplitude(s) for diagram number 647
- FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 648 OF 1240 ***
-
- // Wavefunction(s) for diagram number 648
- // (none)
-
- // Amplitude(s) for diagram number 648
- FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
-
- // *** DIAGRAM 649 OF 1240 ***
-
- // Wavefunction(s) for diagram number 649
- // (none)
-
- // Amplitude(s) for diagram number 649
- FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 650 OF 1240 ***
-
- // Wavefunction(s) for diagram number 650
- // (none)
-
- // Amplitude(s) for diagram number 650
- FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
-
- // *** DIAGRAM 651 OF 1240 ***
-
- // Wavefunction(s) for diagram number 651
- // (none)
-
- // Amplitude(s) for diagram number 651
- FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 652 OF 1240 ***
-
- // Wavefunction(s) for diagram number 652
- // (none)
-
- // Amplitude(s) for diagram number 652
- FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 653 OF 1240 ***
-
- // Wavefunction(s) for diagram number 653
- // (none)
-
- // Amplitude(s) for diagram number 653
- FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 654 OF 1240 ***
-
- // Wavefunction(s) for diagram number 654
- // (none)
-
- // Amplitude(s) for diagram number 654
- VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 655 OF 1240 ***
-
- // Wavefunction(s) for diagram number 655
- VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 655
- VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 656 OF 1240 ***
-
- // Wavefunction(s) for diagram number 656
- VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
-
- // Amplitude(s) for diagram number 656
- VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 657 OF 1240 ***
-
- // Wavefunction(s) for diagram number 657
- // (none)
-
- // Amplitude(s) for diagram number 657
- VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 658 OF 1240 ***
-
- // Wavefunction(s) for diagram number 658
- // (none)
-
- // Amplitude(s) for diagram number 658
- FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 659 OF 1240 ***
-
- // Wavefunction(s) for diagram number 659
- // (none)
-
- // Amplitude(s) for diagram number 659
- FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
-
- // *** DIAGRAM 660 OF 1240 ***
-
- // Wavefunction(s) for diagram number 660
- // (none)
-
- // Amplitude(s) for diagram number 660
- FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
-
- // *** DIAGRAM 661 OF 1240 ***
-
- // Wavefunction(s) for diagram number 661
- // (none)
-
- // Amplitude(s) for diagram number 661
- FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 662 OF 1240 ***
-
- // Wavefunction(s) for diagram number 662
- // (none)
-
- // Amplitude(s) for diagram number 662
- FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 663 OF 1240 ***
-
- // Wavefunction(s) for diagram number 663
- // (none)
-
- // Amplitude(s) for diagram number 663
- FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
-
- // *** DIAGRAM 664 OF 1240 ***
-
- // Wavefunction(s) for diagram number 664
- // (none)
-
- // Amplitude(s) for diagram number 664
- FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
-
- // *** DIAGRAM 665 OF 1240 ***
-
- // Wavefunction(s) for diagram number 665
- // (none)
-
- // Amplitude(s) for diagram number 665
- FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 666 OF 1240 ***
-
- // Wavefunction(s) for diagram number 666
- // (none)
-
- // Amplitude(s) for diagram number 666
- FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
-
- // *** DIAGRAM 667 OF 1240 ***
-
- // Wavefunction(s) for diagram number 667
- // (none)
-
- // Amplitude(s) for diagram number 667
- FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 668 OF 1240 ***
-
- // Wavefunction(s) for diagram number 668
- // (none)
-
- // Amplitude(s) for diagram number 668
- FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 669 OF 1240 ***
-
- // Wavefunction(s) for diagram number 669
- // (none)
-
- // Amplitude(s) for diagram number 669
- FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 670 OF 1240 ***
-
- // Wavefunction(s) for diagram number 670
- // (none)
-
- // Amplitude(s) for diagram number 670
- VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 671 OF 1240 ***
-
- // Wavefunction(s) for diagram number 671
- VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 671
- VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 672 OF 1240 ***
-
- // Wavefunction(s) for diagram number 672
- // (none)
-
- // Amplitude(s) for diagram number 672
- VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 673 OF 1240 ***
-
- // Wavefunction(s) for diagram number 673
- // (none)
-
- // Amplitude(s) for diagram number 673
- VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 674 OF 1240 ***
-
- // Wavefunction(s) for diagram number 674
- // (none)
-
- // Amplitude(s) for diagram number 674
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 675 OF 1240 ***
-
- // Wavefunction(s) for diagram number 675
- // (none)
-
- // Amplitude(s) for diagram number 675
- FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
-
- // *** DIAGRAM 676 OF 1240 ***
-
- // Wavefunction(s) for diagram number 676
- // (none)
-
- // Amplitude(s) for diagram number 676
- FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
-
- // *** DIAGRAM 677 OF 1240 ***
-
- // Wavefunction(s) for diagram number 677
- // (none)
-
- // Amplitude(s) for diagram number 677
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 678 OF 1240 ***
-
- // Wavefunction(s) for diagram number 678
- // (none)
-
- // Amplitude(s) for diagram number 678
- FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 679 OF 1240 ***
-
- // Wavefunction(s) for diagram number 679
- // (none)
-
- // Amplitude(s) for diagram number 679
- FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
-
- // *** DIAGRAM 680 OF 1240 ***
-
- // Wavefunction(s) for diagram number 680
- VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 680
- VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 681 OF 1240 ***
-
- // Wavefunction(s) for diagram number 681
- // (none)
-
- // Amplitude(s) for diagram number 681
- VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
-
- // *** DIAGRAM 682 OF 1240 ***
-
- // Wavefunction(s) for diagram number 682
- // (none)
-
- // Amplitude(s) for diagram number 682
- VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
-
- // *** DIAGRAM 683 OF 1240 ***
-
- // Wavefunction(s) for diagram number 683
- // (none)
-
- // Amplitude(s) for diagram number 683
- VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 684 OF 1240 ***
-
- // Wavefunction(s) for diagram number 684
- // (none)
-
- // Amplitude(s) for diagram number 684
- VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
-
- // *** DIAGRAM 685 OF 1240 ***
-
- // Wavefunction(s) for diagram number 685
- // (none)
-
- // Amplitude(s) for diagram number 685
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 686 OF 1240 ***
-
- // Wavefunction(s) for diagram number 686
- // (none)
-
- // Amplitude(s) for diagram number 686
- VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 687 OF 1240 ***
-
- // Wavefunction(s) for diagram number 687
- // (none)
-
- // Amplitude(s) for diagram number 687
- VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
-
- // *** DIAGRAM 688 OF 1240 ***
-
- // Wavefunction(s) for diagram number 688
- // (none)
-
- // Amplitude(s) for diagram number 688
- VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 689 OF 1240 ***
-
- // Wavefunction(s) for diagram number 689
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] );
-
- // Amplitude(s) for diagram number 689
- VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
-
- // *** DIAGRAM 690 OF 1240 ***
-
- // Wavefunction(s) for diagram number 690
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 690
- VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 691 OF 1240 ***
-
- // Wavefunction(s) for diagram number 691
- // (none)
-
- // Amplitude(s) for diagram number 691
- VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 692 OF 1240 ***
-
- // Wavefunction(s) for diagram number 692
- // (none)
-
- // Amplitude(s) for diagram number 692
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 693 OF 1240 ***
-
- // Wavefunction(s) for diagram number 693
- // (none)
-
- // Amplitude(s) for diagram number 693
- VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 694 OF 1240 ***
-
- // Wavefunction(s) for diagram number 694
- // (none)
-
- // Amplitude(s) for diagram number 694
- VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 695 OF 1240 ***
-
- // Wavefunction(s) for diagram number 695
- // (none)
-
- // Amplitude(s) for diagram number 695
- VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 696 OF 1240 ***
-
- // Wavefunction(s) for diagram number 696
- // (none)
-
- // Amplitude(s) for diagram number 696
- VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 697 OF 1240 ***
-
- // Wavefunction(s) for diagram number 697
- // (none)
-
- // Amplitude(s) for diagram number 697
- FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
-
- // *** DIAGRAM 698 OF 1240 ***
-
- // Wavefunction(s) for diagram number 698
- // (none)
-
- // Amplitude(s) for diagram number 698
- FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 699 OF 1240 ***
-
- // Wavefunction(s) for diagram number 699
- // (none)
-
- // Amplitude(s) for diagram number 699
- FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 700 OF 1240 ***
-
- // Wavefunction(s) for diagram number 700
- // (none)
-
- // Amplitude(s) for diagram number 700
- FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
-
- // *** DIAGRAM 701 OF 1240 ***
-
- // Wavefunction(s) for diagram number 701
- // (none)
-
- // Amplitude(s) for diagram number 701
- VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 702 OF 1240 ***
-
- // Wavefunction(s) for diagram number 702
- // (none)
-
- // Amplitude(s) for diagram number 702
- FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 703 OF 1240 ***
-
- // Wavefunction(s) for diagram number 703
- // (none)
-
- // Amplitude(s) for diagram number 703
- FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
-
- // *** DIAGRAM 704 OF 1240 ***
-
- // Wavefunction(s) for diagram number 704
- // (none)
-
- // Amplitude(s) for diagram number 704
- FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 705 OF 1240 ***
-
- // Wavefunction(s) for diagram number 705
- // (none)
-
- // Amplitude(s) for diagram number 705
- FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 706 OF 1240 ***
-
- // Wavefunction(s) for diagram number 706
- // (none)
-
- // Amplitude(s) for diagram number 706
- VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 707 OF 1240 ***
-
- // Wavefunction(s) for diagram number 707
- // (none)
-
- // Amplitude(s) for diagram number 707
- FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
-
- // *** DIAGRAM 708 OF 1240 ***
-
- // Wavefunction(s) for diagram number 708
- // (none)
-
- // Amplitude(s) for diagram number 708
- FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 709 OF 1240 ***
-
- // Wavefunction(s) for diagram number 709
- // (none)
-
- // Amplitude(s) for diagram number 709
- FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 710 OF 1240 ***
-
- // Wavefunction(s) for diagram number 710
- // (none)
-
- // Amplitude(s) for diagram number 710
- FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
-
- // *** DIAGRAM 711 OF 1240 ***
-
- // Wavefunction(s) for diagram number 711
- // (none)
-
- // Amplitude(s) for diagram number 711
- VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 712 OF 1240 ***
-
- // Wavefunction(s) for diagram number 712
- // (none)
-
- // Amplitude(s) for diagram number 712
- FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 713 OF 1240 ***
-
- // Wavefunction(s) for diagram number 713
- // (none)
-
- // Amplitude(s) for diagram number 713
- FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
-
- // *** DIAGRAM 714 OF 1240 ***
-
- // Wavefunction(s) for diagram number 714
- // (none)
-
- // Amplitude(s) for diagram number 714
- FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 715 OF 1240 ***
-
- // Wavefunction(s) for diagram number 715
- // (none)
-
- // Amplitude(s) for diagram number 715
- FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 716 OF 1240 ***
-
- // Wavefunction(s) for diagram number 716
- // (none)
-
- // Amplitude(s) for diagram number 716
- VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 717 OF 1240 ***
-
- // Wavefunction(s) for diagram number 717
- // (none)
-
- // Amplitude(s) for diagram number 717
- FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
-
- // *** DIAGRAM 718 OF 1240 ***
-
- // Wavefunction(s) for diagram number 718
- // (none)
-
- // Amplitude(s) for diagram number 718
- FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 719 OF 1240 ***
-
- // Wavefunction(s) for diagram number 719
- // (none)
-
- // Amplitude(s) for diagram number 719
- FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 720 OF 1240 ***
-
- // Wavefunction(s) for diagram number 720
- // (none)
-
- // Amplitude(s) for diagram number 720
- FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
-
- // *** DIAGRAM 721 OF 1240 ***
-
- // Wavefunction(s) for diagram number 721
- // (none)
-
- // Amplitude(s) for diagram number 721
- VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 722 OF 1240 ***
-
- // Wavefunction(s) for diagram number 722
- // (none)
-
- // Amplitude(s) for diagram number 722
- FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 723 OF 1240 ***
-
- // Wavefunction(s) for diagram number 723
- // (none)
-
- // Amplitude(s) for diagram number 723
- VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 724 OF 1240 ***
-
- // Wavefunction(s) for diagram number 724
- // (none)
-
- // Amplitude(s) for diagram number 724
- FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
-
- // *** DIAGRAM 725 OF 1240 ***
-
- // Wavefunction(s) for diagram number 725
- // (none)
-
- // Amplitude(s) for diagram number 725
- FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 726 OF 1240 ***
-
- // Wavefunction(s) for diagram number 726
- // (none)
-
- // Amplitude(s) for diagram number 726
- FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 727 OF 1240 ***
-
- // Wavefunction(s) for diagram number 727
- // (none)
-
- // Amplitude(s) for diagram number 727
- FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
-
- // *** DIAGRAM 728 OF 1240 ***
-
- // Wavefunction(s) for diagram number 728
- // (none)
-
- // Amplitude(s) for diagram number 728
- VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 729 OF 1240 ***
-
- // Wavefunction(s) for diagram number 729
- // (none)
-
- // Amplitude(s) for diagram number 729
- FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 730 OF 1240 ***
-
- // Wavefunction(s) for diagram number 730
- // (none)
-
- // Amplitude(s) for diagram number 730
- FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 731 OF 1240 ***
-
- // Wavefunction(s) for diagram number 731
- // (none)
-
- // Amplitude(s) for diagram number 731
- FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 732 OF 1240 ***
-
- // Wavefunction(s) for diagram number 732
- // (none)
-
- // Amplitude(s) for diagram number 732
- FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 733 OF 1240 ***
-
- // Wavefunction(s) for diagram number 733
- // (none)
-
- // Amplitude(s) for diagram number 733
- FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 734 OF 1240 ***
-
- // Wavefunction(s) for diagram number 734
- // (none)
-
- // Amplitude(s) for diagram number 734
- FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 735 OF 1240 ***
-
- // Wavefunction(s) for diagram number 735
- // (none)
-
- // Amplitude(s) for diagram number 735
- FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
-
- // *** DIAGRAM 736 OF 1240 ***
-
- // Wavefunction(s) for diagram number 736
- // (none)
-
- // Amplitude(s) for diagram number 736
- FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 737 OF 1240 ***
-
- // Wavefunction(s) for diagram number 737
- // (none)
-
- // Amplitude(s) for diagram number 737
- FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 738 OF 1240 ***
-
- // Wavefunction(s) for diagram number 738
- // (none)
-
- // Amplitude(s) for diagram number 738
- VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 739 OF 
-
-    // Wavefunction(s) for diagram number 739
-    FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] );
-
-    // Amplitude(s) for diagram number 739
-    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 740 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 740
-    // (none)
-
-    // Amplitude(s) for diagram number 740
-    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] -= amp_sv[0];
-
-    // *** DIAGRAM 741 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 741
-    FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-    // Amplitude(s) for diagram number 741
-    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] -= amp_sv[0];
-
-    // *** DIAGRAM 742 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 742
-    // (none)
-
-    // Amplitude(s) for diagram number 742
-    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[46] -= amp_sv[0];
-
-    // *** DIAGRAM 743 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 743
-    // (none)
-
-    // Amplitude(s) for diagram number 743
-    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[37] -= amp_sv[0];
-
-    // *** DIAGRAM 744 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 744
-    // (none)
-
-    // Amplitude(s) for diagram number 744
-    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[43] -= amp_sv[0];
-
-    // *** DIAGRAM 745 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 745
-    // (none)
-
-    // Amplitude(s) for diagram number 745
-    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 746 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 746
-    // (none)
-
-    // Amplitude(s) for diagram number 746
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 747 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 747
-    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
-
-    // Amplitude(s) for diagram number 747
-    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-
-    // *** DIAGRAM 748 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 748
-    // (none)
-
-    // Amplitude(s) for diagram number 748
-    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 749 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 749
-    // (none)
-
-    // Amplitude(s) for diagram number 749
-    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] -= amp_sv[0];
-
-    // *** DIAGRAM 750 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 750
-    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-    // Amplitude(s) for diagram number 750
-    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[34] -= amp_sv[0];
-
-    // *** DIAGRAM 751 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 751
-    // (none)
-
-    // Amplitude(s) for diagram number 751
-    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[44] -= amp_sv[0];
-
-    // *** DIAGRAM 752 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 752
-    // (none)
-
-    // Amplitude(s) for diagram number 752
-    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[31] -= amp_sv[0];
-
-    // *** DIAGRAM 753 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 753
-    // (none)
-
-    // Amplitude(s) for diagram number 753
-    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] -= amp_sv[0];
-
-    // *** DIAGRAM 754 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 754
-    // (none)
-
-    // Amplitude(s) for diagram number 754
-    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 755 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 755
-    // (none)
-
-    // Amplitude(s) for diagram number 755
-    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 756 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 756
-    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-    // Amplitude(s) for diagram number 756
-    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[34] -= amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-
-    // *** DIAGRAM 757 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 757
-    // (none)
-
-    // Amplitude(s) for diagram number 757
-    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[26] -= amp_sv[0];
-
-    // *** DIAGRAM 758 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 758
-    // (none)
-
-    // Amplitude(s) for diagram number 758
-    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] -= amp_sv[0];
-
-    // *** DIAGRAM 759 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 759
-    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-    // Amplitude(s) for diagram number 759
-    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[32] -= amp_sv[0];
-
-    // *** DIAGRAM 760 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 760
-    // (none)
-
-    // Amplitude(s) for diagram number 760
-    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] -= amp_sv[0];
-
-    // *** DIAGRAM 761 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 761
-    // (none)
-
-    // Amplitude(s) for diagram number 761
-    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] -= amp_sv[0];
-
-    // *** DIAGRAM 762 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 762
-    // (none)
-
-    // Amplitude(s) for diagram number 762
-    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] -= amp_sv[0];
-
-    // *** DIAGRAM 763 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 763
-    // (none)
-
-    // Amplitude(s) for diagram number 763
-    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 764 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 764
-    // (none)
-
-    // Amplitude(s) for diagram number 764
-    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 765 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 765
-    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
-
-    // Amplitude(s) for diagram number 765
-    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-
-    // *** DIAGRAM 766 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 766
-    // (none)
-
-    // Amplitude(s) for diagram number 766
-    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 767 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 767
-    // (none)
-
-    // Amplitude(s) for diagram number 767
-    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 768 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 768
-    // (none)
-
-    // Amplitude(s) for diagram number 768
-    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 769 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 769
-    // (none)
-
-    // Amplitude(s) for diagram number 769
-    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 770 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 770
-    // (none)
-
-    // Amplitude(s) for diagram number 770
-    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 771 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 771
-    // (none)
-
-    // Amplitude(s) for diagram number 771
-    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 772 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 772
-    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
-    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
-    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 772
-    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 773 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 773
-    // (none)
-
-    // Amplitude(s) for diagram number 773
-    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 774 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 774
-    // (none)
-
-    // Amplitude(s) for diagram number 774
-    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 775 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 775
-    // (none)
-
-    // Amplitude(s) for diagram number 775
-    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 776 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 776
-    // (none)
-
-    // Amplitude(s) for diagram number 776
-    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-
-    // *** DIAGRAM 777 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 777
-    // (none)
-
-    // Amplitude(s) for diagram number 777
-    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 778 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 778
-    // (none)
-
-    // Amplitude(s) for diagram number 778
-    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 779 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 779
-    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] );
-    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-    // Amplitude(s) for diagram number 779
-    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 780 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 780
-    // (none)
-
-    // Amplitude(s) for diagram number 780
-    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 781 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 781
-    // (none)
-
-    // Amplitude(s) for diagram number 781
-    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 782 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 782
-    // (none)
-
-    // Amplitude(s) for diagram number 782
-    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 783 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 783
-    // (none)
-
-    // Amplitude(s) for diagram number 783
-    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[35] += amp_sv[0];
-
-    // *** DIAGRAM 784 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 784
-    // (none)
-
-    // Amplitude(s) for diagram number 784
-    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 785 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 785
-    // (none)
-
-    // Amplitude(s) for diagram number 785
-    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 786 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 786
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
-
-    // Amplitude(s) for diagram number 786
-    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 787 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 787
-    // (none)
-
-    // Amplitude(s) for diagram number 787
-    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 788 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 788
-    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
-    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
-    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
-
-    // Amplitude(s) for diagram number 788
-    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 789 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 789
-    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 789
-    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] -= amp_sv[0];
-
-    // *** DIAGRAM 790 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 790
-    // (none)
-
-    // Amplitude(s) for diagram number 790
-    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] -= amp_sv[0];
-
-    // *** DIAGRAM 791 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 791
-    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-    // Amplitude(s) for diagram number 791
-    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[53] -= amp_sv[0];
-
-    // *** DIAGRAM 792 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 792
-    // (none)
-
-    // Amplitude(s) for diagram number 792
-    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] -= amp_sv[0];
-
-    // *** DIAGRAM 793 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 793
-    // (none)
-
-    // Amplitude(s) for diagram number 793
-    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[67] -= amp_sv[0];
-
-    // *** DIAGRAM 794 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 794
-    // (none)
-
-    // Amplitude(s) for diagram number 794
-    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[61] -= amp_sv[0];
-
-    // *** DIAGRAM 795 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 795
-    // (none)
-
-    // Amplitude(s) for diagram number 795
-    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 796 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 796
-    // (none)
-
-    // Amplitude(s) for diagram number 796
-    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 797 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 797
-    // (none)
-
-    // Amplitude(s) for diagram number 797
-    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-
-    // *** DIAGRAM 798 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 798
-    // (none)
-
-    // Amplitude(s) for diagram number 798
-    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] -= amp_sv[0];
-
-    // *** DIAGRAM 799 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 799
-    // (none)
-
-    // Amplitude(s) for diagram number 799
-    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] -= amp_sv[0];
-
-    // *** DIAGRAM 800 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 800
-    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-    // Amplitude(s) for diagram number 800
-    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[77] -= amp_sv[0];
-
-    // *** DIAGRAM 801 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 801
-    // (none)
-
-    // Amplitude(s) for diagram number 801
-    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] -= amp_sv[0];
-
-    // *** DIAGRAM 802 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 802
-    // (none)
-
-    // Amplitude(s) for diagram number 802
-    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[91] -= amp_sv[0];
-
-    // *** DIAGRAM 803 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 803
-    // (none)
-
-    // Amplitude(s) for diagram number 803
-    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[85] -= amp_sv[0];
-
-    // *** DIAGRAM 804 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 804
-    // (none)
-
-    // Amplitude(s) for diagram number 804
-    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 805 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 805
-    // (none)
-
-    // Amplitude(s) for diagram number 805
-    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 806 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 806
-    // (none)
-
-    // Amplitude(s) for diagram number 806
-    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[77] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-
-    // *** DIAGRAM 807 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 807
-    // (none)
-
-    // Amplitude(s) for diagram number 807
-    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 808 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 808
-    // (none)
-
-    // Amplitude(s) for diagram number 808
-    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 809 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 809
-    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-    // Amplitude(s) for diagram number 809
-    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 810 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 810
-    // (none)
-
-    // Amplitude(s) for diagram number 810
-    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] -= amp_sv[0];
-
-    // *** DIAGRAM 811 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 811
-    // (none)
-
-    // Amplitude(s) for diagram number 811
-    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[115] -= amp_sv[0];
-
-    // *** DIAGRAM 812 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 812
-    // (none)
-
-    // Amplitude(s) for diagram number 812
-    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[109] -= amp_sv[0];
-
-    // *** DIAGRAM 813 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 813
-    // (none)
-
-    // Amplitude(s) for diagram number 813
-    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 814 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 814
-    // (none)
-
-    // Amplitude(s) for diagram number 814
-    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 815 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 815
-    // (none)
-
-    // Amplitude(s) for diagram number 815
-    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 816 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 816
-    // (none)
-
-    // Amplitude(s) for diagram number 816
-    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 817 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 817
-    // (none)
-
-    // Amplitude(s) for diagram number 817
-    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 818 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 818
-    // (none)
-
-    // Amplitude(s) for diagram number 818
-    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 819 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 819
-    // (none)
-
-    // Amplitude(s) for diagram number 819
-    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-
-    // *** DIAGRAM 820 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 820
-    // (none)
-
-    // Amplitude(s) for diagram number 820
-    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 821 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 821
-    // (none)
-
-    // Amplitude(s) for diagram number 821
-    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 822 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 822
-    // (none)
-
-    // Amplitude(s) for diagram number 822
-    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 823 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 823
-    // (none)
-
-    // Amplitude(s) for diagram number 823
-    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 824 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 824
-    // (none)
-
-    // Amplitude(s) for diagram number 824
-    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 825 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 825
-    // (none)
-
-    // Amplitude(s) for diagram number 825
-    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 826 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 826
-    // (none)
-
-    // Amplitude(s) for diagram number 826
-    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 827 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 827
-    // (none)
-
-    // Amplitude(s) for diagram number 827
-    VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 828 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 828
-    // (none)
-
-    // Amplitude(s) for diagram number 828
-    FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 829 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 829
-    // (none)
-
-    // Amplitude(s) for diagram number 829
-    FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 830 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 830
-    // (none)
-
-    // Amplitude(s) for diagram number 830
-    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 831 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 831
-    // (none)
-
-    // Amplitude(s) for diagram number 831
-    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 832 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 832
-    // (none)
-
-    // Amplitude(s) for diagram number 832
-    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 833 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 833
-    // (none)
-
-    // Amplitude(s) for diagram number 833
-    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[17] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-
-    // *** DIAGRAM 834 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 834
-    // (none)
-
-    // Amplitude(s) for diagram number 834
-    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 835 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 835
-    // (none)
-
-    // Amplitude(s) for diagram number 835
-    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 836 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 836
-    // (none)
-
-    // Amplitude(s) for diagram number 836
-    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 837 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 837
-    // (none)
-
-    // Amplitude(s) for diagram number 837
-    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 838 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 838
-    // (none)
-
-    // Amplitude(s) for diagram number 838
-    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 839 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 839
-    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] );
-
-    // Amplitude(s) for diagram number 839
-    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 840 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 840
-    // (none)
-
-    // Amplitude(s) for diagram number 840
-    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 841 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 841
-    // (none)
-
-    // Amplitude(s) for diagram number 841
-    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 842 OF 1240 *** - - // Wavefunction(s) for diagram number 842 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); - - // Amplitude(s) for diagram number 842 - VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 843 OF 1240 *** - - // Wavefunction(s) for diagram number 843 - // (none) - - // Amplitude(s) for diagram number 843 - VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 844 OF 1240 *** - - // Wavefunction(s) for diagram number 844 - // (none) - - // Amplitude(s) for diagram number 844 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - 
jamp_sv[59] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 845 OF 1240 *** - - // Wavefunction(s) for diagram number 845 - // (none) - - // Amplitude(s) for diagram number 845 - VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 846 OF 1240 *** - - // Wavefunction(s) for diagram number 846 - // (none) - - // Amplitude(s) for diagram number 846 - VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 847 OF 1240 *** - - // Wavefunction(s) for diagram number 847 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 847 - VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - 
jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 848 OF 1240 *** - - // Wavefunction(s) for diagram number 848 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 848 - VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 849 OF 1240 *** - - // Wavefunction(s) for diagram number 849 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); - - // Amplitude(s) for diagram number 849 - VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] 
-= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 850 OF 1240 *** - - // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); - - // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 851 OF 1240 *** - - // Wavefunction(s) for diagram number 851 - // (none) - - // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= 
amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 852 OF 1240 *** - - // Wavefunction(s) for diagram number 852 - // (none) - - // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 853 OF 1240 *** - - // Wavefunction(s) for diagram number 853 - // (none) - - // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 854 OF 1240 *** - - // Wavefunction(s) for diagram number 854 - // (none) - - // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 855 OF 1240 *** - - // Wavefunction(s) for diagram number 855 - // (none) - - // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 856 OF 1240 *** - - // Wavefunction(s) for diagram number 856 - // (none) - - // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 857 OF 1240 *** - - // Wavefunction(s) for diagram number 857 - // (none) - - // Amplitude(s) for diagram number 857 - FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 858 OF 1240 *** - - // Wavefunction(s) for diagram number 858 - // (none) - - // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 859 OF 1240 *** - - // Wavefunction(s) for diagram number 859 - // (none) - - // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 860 OF 1240 *** - - // Wavefunction(s) for diagram number 860 - // (none) - - // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 861 OF 1240 *** - - // Wavefunction(s) for diagram number 861 - // (none) - - // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 862 OF 1240 *** - - // Wavefunction(s) for diagram number 862 - // (none) - - // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 863 OF 1240 *** - - // Wavefunction(s) for diagram number 863 - // (none) - - // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 864 OF 1240 *** - - // Wavefunction(s) for diagram number 864 - // (none) - - // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 865 OF 1240 *** - - // Wavefunction(s) for diagram number 865 - // (none) - - // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 866 OF 1240 *** - - // Wavefunction(s) for diagram number 866 - // (none) - - // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 867 OF 1240 *** - - // Wavefunction(s) for diagram number 867 - // (none) - - // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 868 OF 1240 *** - - // Wavefunction(s) for diagram number 868 - // (none) - - // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 869 OF 1240 *** - - // Wavefunction(s) for diagram number 869 - // (none) - - // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 870 OF 1240 *** - - // Wavefunction(s) for diagram number 870 - // (none) - - // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 871 OF 1240 *** - - // Wavefunction(s) for diagram number 871 - // (none) - - // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 872 OF 1240 *** - - // Wavefunction(s) for diagram number 872 - // (none) - - // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 872 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 873 OF 1240 *** - - // Wavefunction(s) for diagram number 873 - // (none) - - // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 874 OF 1240 *** - - // Wavefunction(s) for diagram number 874 - // (none) - - // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 875 OF 1240 *** - - // Wavefunction(s) for diagram number 875 - // (none) - - // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 876 OF 1240 *** - - // Wavefunction(s) for diagram number 876 - // (none) - - // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 877 OF 1240 *** - - // Wavefunction(s) for diagram number 877 - // (none) - - // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 878 OF 1240 *** - - // Wavefunction(s) for diagram number 878 - // (none) - - // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 879 OF 1240 *** - - // Wavefunction(s) for diagram number 879 - // (none) - - // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId 
== 879 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 880 OF 1240 *** - - // Wavefunction(s) for diagram number 880 - // (none) - - // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 881 OF 1240 *** - - // Wavefunction(s) for diagram number 881 - // (none) - - // Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 882 OF 1240 *** - - // Wavefunction(s) for diagram number 882 - // (none) - - // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 883 OF 1240 *** - - // Wavefunction(s) for diagram number 883 - // (none) - - // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[54] 
+= amp_sv[0]; - - // *** DIAGRAM 884 OF 1240 *** - - // Wavefunction(s) for diagram number 884 - // (none) - - // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 885 OF 1240 *** - - // Wavefunction(s) for diagram number 885 - // (none) - - // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 886 OF 1240 *** - - // Wavefunction(s) for diagram number 886 - // (none) - - // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 887 OF 1240 *** - - // Wavefunction(s) for diagram number 887 - // (none) - - // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 888 OF 1240 *** - - // Wavefunction(s) for diagram number 888 - // (none) - - // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * 
amp_sv[0]; - - // *** DIAGRAM 889 OF 1240 *** - - // Wavefunction(s) for diagram number 889 - // (none) - - // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 890 OF 1240 *** - - // Wavefunction(s) for diagram number 890 - // (none) - - // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 891 OF 1240 *** - - // Wavefunction(s) for diagram number 891 - // (none) - - // Amplitude(s) for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 892 OF 1240 *** - - // Wavefunction(s) for diagram number 892 - // (none) - - // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 893 OF 1240 *** - - // Wavefunction(s) for diagram number 893 - // (none) - - // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 894 OF 1240 *** - - // Wavefunction(s) for diagram number 894 - // (none) - - // Amplitude(s) for diagram 
number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 895 OF 1240 *** - - // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); - - // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 896 OF 1240 *** - - // Wavefunction(s) for diagram number 896 - // (none) - - // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 897 OF 1240 *** - - // Wavefunction(s) for diagram number 897 - // (none) - - // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - 
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 898 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 898
-    // (none)
-
-    // Amplitude(s) for diagram number 898
-    VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 899 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 899
-    // (none)
-
-    // Amplitude(s) for diagram number 899
-    VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 900 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 900
-    // (none)
-
-    // Amplitude(s) for diagram number 900
-    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 901 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 901
-    // (none)
-
-    // Amplitude(s) for diagram number 901
-    VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 902 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 902
-    // (none)
-
-    // Amplitude(s) for diagram number 902
-    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 903 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 903
-    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
-    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
-    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 903
-    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 904 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 904
-    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
-    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
-    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
-
-    // Amplitude(s) for diagram number 904
-    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 905 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 905
-    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
-    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
-    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
-
-    // Amplitude(s) for diagram number 905
-    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 906 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 906
-    // (none)
-
-    // Amplitude(s) for diagram number 906
-    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 907 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 907
-    // (none)
-
-    // Amplitude(s) for diagram number 907
-    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 908 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 908
-    // (none)
-
-    // Amplitude(s) for diagram number 908
-    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 909 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 909
-    // (none)
-
-    // Amplitude(s) for diagram number 909
-    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 910 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 910
-    // (none)
-
-    // Amplitude(s) for diagram number 910
-    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 911 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 911
-    // (none)
-
-    // Amplitude(s) for diagram number 911
-    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 912 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 912
-    // (none)
-
-    // Amplitude(s) for diagram number 912
-    FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 913 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 913
-    // (none)
-
-    // Amplitude(s) for diagram number 913
-    FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 914 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 914
-    // (none)
-
-    // Amplitude(s) for diagram number 914
-    FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-
-    // *** DIAGRAM 915 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 915
-    // (none)
-
-    // Amplitude(s) for diagram number 915
-    FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 916 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 916
-    // (none)
-
-    // Amplitude(s) for diagram number 916
-    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 917 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 917
-    // (none)
-
-    // Amplitude(s) for diagram number 917
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 918 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 918
-    // (none)
-
-    // Amplitude(s) for diagram number 918
-    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 919 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 919
-    // (none)
-
-    // Amplitude(s) for diagram number 919
-    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 920 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 920
-    // (none)
-
-    // Amplitude(s) for diagram number 920
-    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 921 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 921
-    // (none)
-
-    // Amplitude(s) for diagram number 921
-    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 922 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 922
-    // (none)
-
-    // Amplitude(s) for diagram number 922
-    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 923 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 923
-    // (none)
-
-    // Amplitude(s) for diagram number 923
-    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 924 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 924
-    // (none)
-
-    // Amplitude(s) for diagram number 924
-    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-
-    // *** DIAGRAM 925 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 925
-    // (none)
-
-    // Amplitude(s) for diagram number 925
-    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 926 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 926
-    // (none)
-
-    // Amplitude(s) for diagram number 926
-    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 927 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 927
-    // (none)
-
-    // Amplitude(s) for diagram number 927
-    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 928 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 928
-    // (none)
-
-    // Amplitude(s) for diagram number 928
-    FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 929 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 929
-    // (none)
-
-    // Amplitude(s) for diagram number 929
-    FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 930 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 930
-    // (none)
-
-    // Amplitude(s) for diagram number 930
-    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 931 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 931
-    // (none)
-
-    // Amplitude(s) for diagram number 931
-    VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 932 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 932
-    // (none)
-
-    // Amplitude(s) for diagram number 932
-    FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 933 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 933
-    // (none)
-
-    // Amplitude(s) for diagram number 933
-    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 934 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 934
-    // (none)
-
-    // Amplitude(s) for diagram number 934
-    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 935 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 935
-    // (none)
-
-    // Amplitude(s) for diagram number 935
-    FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 936 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 936
-    // (none)
-
-    // Amplitude(s) for diagram number 936
-    VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 937 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 937
-    // (none)
-
-    // Amplitude(s) for diagram number 937
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 938 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 938
-    // (none)
-
-    // Amplitude(s) for diagram number 938
-    VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 939 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 939
-    // (none)
-
-    // Amplitude(s) for diagram number 939
-    FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-
-    // *** DIAGRAM 940 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 940
-    // (none)
-
-    // Amplitude(s) for diagram number 940
-    FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 941 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 941
-    // (none)
-
-    // Amplitude(s) for diagram number 941
-    FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-
-    // *** DIAGRAM 942 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 942
-    // (none)
-
-    // Amplitude(s) for diagram number 942
-    FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 943 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 943
-    // (none)
-
-    // Amplitude(s) for diagram number 943
-    VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 944 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 944
-    // (none)
-
-    // Amplitude(s) for diagram number 944
-    FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 945 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 945
-    // (none)
-
-    // Amplitude(s) for diagram number 945
-    FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 946 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 946
-    // (none)
-
-    // Amplitude(s) for diagram number 946
-    FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 947 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 947
-    // (none)
-
-    // Amplitude(s) for diagram number 947
-    FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 948 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 948
-    // (none)
-
-    // Amplitude(s) for diagram number 948
-    FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 949 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 949
-    // (none)
-
-    // Amplitude(s) for diagram number 949
-    FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 950 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 950
-    // (none)
-
-    // Amplitude(s) for diagram number 950
-    FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 951 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 951
-    VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
-    // Amplitude(s) for diagram number 951
-    VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 952 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 952
-    // (none)
-
-    // Amplitude(s) for diagram number 952
-    VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 953 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 953
-    // (none)
-
-    // Amplitude(s) for diagram number 953
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 954 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 954
-    // (none)
-
-    // Amplitude(s) for diagram number 954
-    VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-
-    // *** DIAGRAM 955 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 955
-    // (none)
-
-    // Amplitude(s) for diagram number 955
-    VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 956 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 956
-    // (none)
-
-    // Amplitude(s) for diagram number 956
-    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[45] += amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[105] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 957 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 957
-    // (none)
-
-    // Amplitude(s) for diagram number 957
-    VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[58] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[76] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-
-    // *** DIAGRAM 958 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 958
-    // (none)
-
-    // Amplitude(s) for diagram number 958
-    VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-
-    // *** DIAGRAM 959 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 959
-    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
-    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
-    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 959
-    VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[58] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[76] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 960 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 960
-    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
-    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
-    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
-
-    // Amplitude(s) for diagram number 960
-    VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 961 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 961
-    // (none)
-
-    // Amplitude(s) for diagram number 961
-    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[59] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 962 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 962
-    // (none)
-
-    // Amplitude(s) for diagram number 962
-    VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-
-    // *** DIAGRAM 963 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 963
-    // (none)
-
-    // Amplitude(s) for diagram number 963
-    VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[42] -= amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 964 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 964
-    // (none)
-
-    // Amplitude(s) for diagram number 964
-    VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[42] -= amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 965 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 965
-    // (none)
-
-    // Amplitude(s) for diagram number 965
-    VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 966 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 966
-    // (none)
-
-    // Amplitude(s) for diagram number 966
-    VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 967 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 967
-    // (none)
-
-    // Amplitude(s) for diagram number 967
-    VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 968 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 968
-    // (none)
-
-    // Amplitude(s) for diagram number 968
-    FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 969 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 969
-    // (none)
-
-    // Amplitude(s) for diagram number 969
-    FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 970 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 970
-    // (none)
-
-    // Amplitude(s) for diagram number 970
-    FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-
-    // *** DIAGRAM 971 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 971
-    // (none)
-
-    // Amplitude(s) for diagram number 971
-    FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 972 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 972
-    // (none)
-
-    // Amplitude(s) for diagram number 972
-    VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 973 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 973
-    // (none)
-
-    // Amplitude(s) for diagram number 973
-    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 974 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 974
-    // (none)
-
-    // Amplitude(s) for diagram number 974
-    FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 975 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 975
-    // (none)
-
-    // Amplitude(s) for diagram number 975
-    FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 976 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 976
-    // (none)
-
-    // Amplitude(s) for diagram number 976
-    FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 977 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 977
-    // (none)
-
-    // Amplitude(s) for diagram number 977
-    VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 978 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 978
-    // (none)
-
-    // Amplitude(s) for diagram number 978
-    FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 979 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 979
-    // (none)
-
-    // Amplitude(s) for diagram number 979
-    FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 980 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 980
-    // (none)
-
-    // Amplitude(s) for diagram number 980
-    FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-
-    // *** DIAGRAM 981 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 981
-    // (none)
-
-    // Amplitude(s) for diagram number 981
-    FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 982 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 982
-    // (none)
-
-    // Amplitude(s) for diagram number 982
-    VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 983 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 983
-    // (none)
-
-    // Amplitude(s) for diagram number 983
-    FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 984 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 984
-    // (none)
-
-    // Amplitude(s) for diagram number 984
-    FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 985 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 985
-    // (none)
-
-    // Amplitude(s) for diagram number 985
-    FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 986 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 986
-    // (none)
-
-    // Amplitude(s) for diagram number 986
-    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId !=
0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 987 OF 1240 *** - - // Wavefunction(s) for diagram number 987 - // (none) - - // Amplitude(s) for diagram number 987 - VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 988 OF 1240 *** - - // Wavefunction(s) for diagram number 988 - // (none) - - // Amplitude(s) for diagram number 988 - FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 989 OF 1240 *** - - // Wavefunction(s) for diagram number 989 - // (none) - - // Amplitude(s) for diagram number 989 - FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 990 OF 1240 *** - - // Wavefunction(s) for diagram number 990 - // (none) - - // Amplitude(s) for diagram number 990 - FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[46] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 991 OF 1240 *** - - // Wavefunction(s) for diagram number 991 - // (none) - - // Amplitude(s) for diagram number 991 - FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 992 OF 1240 *** - - // Wavefunction(s) for diagram number 992 - // (none) - - // Amplitude(s) for diagram number 992 - VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 993 OF 1240 *** - - // 
Wavefunction(s) for diagram number 993 - // (none) - - // Amplitude(s) for diagram number 993 - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 994 OF 1240 *** - - // Wavefunction(s) for diagram number 994 - // (none) - - // Amplitude(s) for diagram number 994 - VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 995 OF 1240 *** - - // Wavefunction(s) for diagram number 995 - // (none) - - // Amplitude(s) for diagram number 995 - FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - - // *** DIAGRAM 996 OF 1240 *** - - // Wavefunction(s) for diagram number 996 - // (none) - - // Amplitude(s) for diagram number 996 - FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 997 OF 1240 *** - - // Wavefunction(s) for diagram number 997 - // (none) - - // Amplitude(s) for diagram number 997 - FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - - // *** 
DIAGRAM 998 OF 1240 *** - - // Wavefunction(s) for diagram number 998 - // (none) - - // Amplitude(s) for diagram number 998 - FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 999 OF 1240 *** - - // Wavefunction(s) for diagram number 999 - // (none) - - // Amplitude(s) for diagram number 999 - VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1000 OF 1240 *** - - // Wavefunction(s) for diagram number 1000 - // (none) - - // Amplitude(s) for diagram number 1000 - FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1001 OF 1240 *** - - // Wavefunction(s) for diagram number 1001 - // (none) - - // Amplitude(s) for diagram number 1001 - FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1002 OF 1240 *** - - // Wavefunction(s) for diagram number 1002 - // (none) - - // Amplitude(s) for diagram number 1002 - FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1003 OF 1240 *** - - // Wavefunction(s) for diagram number 1003 - // (none) - - // Amplitude(s) for diagram number 1003 - FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1004 OF 1240 *** - - // Wavefunction(s) for diagram number 1004 - // (none) - - // Amplitude(s) for diagram number 1004 - FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1005 OF 1240 *** - - // Wavefunction(s) for diagram number 1005 - // (none) - - // Amplitude(s) for diagram number 1005 - FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 1006 OF 1240 *** - - // Wavefunction(s) for diagram number 1006 - // (none) - - // Amplitude(s) for diagram number 1006 - FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 1007 OF 1240 *** - - // Wavefunction(s) for diagram number 1007 - // (none) - - // Amplitude(s) for diagram number 1007 - VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - 
jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1008 OF 1240 *** - - // Wavefunction(s) for diagram number 1008 - // (none) - - // Amplitude(s) for diagram number 1008 - VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1009 OF 1240 *** - - // Wavefunction(s) for diagram number 1009 - // (none) - - // Amplitude(s) for diagram number 1009 - VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1010 OF 1240 *** - - // Wavefunction(s) for diagram number 1010 - // (none) - - // Amplitude(s) for diagram number 1010 - VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - 
jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1011 OF 1240 *** - - // Wavefunction(s) for diagram number 1011 - // (none) - - // Amplitude(s) for diagram number 1011 - VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1012 OF 1240 *** - - // Wavefunction(s) for diagram number 1012 - // (none) - - // Amplitude(s) for diagram number 1012 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 1013 OF 1240 *** - - // Wavefunction(s) for diagram number 1013 - // (none) - - // Amplitude(s) for diagram number 1013 - VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1014 OF 1240 *** - - // Wavefunction(s) for diagram number 1014 - // (none) - - // Amplitude(s) for 
diagram number 1014 - VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1015 OF 1240 *** - - // Wavefunction(s) for diagram number 1015 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); - - // Amplitude(s) for diagram number 1015 - VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1016 OF 1240 *** - - // Wavefunction(s) for diagram number 1016 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 1016 - VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= 
amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1017 OF 1240 *** - - // Wavefunction(s) for diagram number 1017 - // (none) - - // Amplitude(s) for diagram number 1017 - VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1018 OF 1240 *** - - // Wavefunction(s) for diagram number 1018 - // (none) - - // Amplitude(s) for diagram number 1018 - VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - 
jamp_sv[39] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1019 OF 1240 *** - - // Wavefunction(s) for diagram number 1019 - // (none) - - // Amplitude(s) for diagram number 1019 - VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1020 OF 1240 *** - - // Wavefunction(s) for diagram number 1020 - // (none) - - // Amplitude(s) for diagram number 1020 - VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1021 OF 1240 *** - - // Wavefunction(s) for diagram number 1021 - // (none) - - // Amplitude(s) for diagram number 1021 - VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] 
-= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1022 OF 1240 *** - - // Wavefunction(s) for diagram number 1022 - // (none) - - // Amplitude(s) for diagram number 1022 - VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1023 OF 1240 *** - - // Wavefunction(s) for diagram number 1023 - // (none) - - // Amplitude(s) for diagram number 1023 - VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1024 OF 1240 *** - - // Wavefunction(s) for diagram number 1024 - // (none) - - // Amplitude(s) for diagram number 1024 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += 
amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1025 OF 1240 *** - - // Wavefunction(s) for diagram number 1025 - // (none) - - // Amplitude(s) for diagram number 1025 - VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 1026 OF 1240 *** - - // Wavefunction(s) for diagram number 1026 - // (none) - - // Amplitude(s) for diagram number 1026 - VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1027 OF 1240 *** - - // Wavefunction(s) for diagram number 1027 - // (none) - - // Amplitude(s) for diagram number 1027 - VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= 
amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1028 OF 1240 *** - - // Wavefunction(s) for diagram number 1028 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1028 - VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1029 OF 1240 *** - - // Wavefunction(s) for diagram number 1029 - // (none) - - // Amplitude(s) for diagram number 1029 - VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += 
amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1030 OF 1240 *** - - // Wavefunction(s) for diagram number 1030 - // (none) - - // Amplitude(s) for diagram number 1030 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1031 OF 1240 *** - - // Wavefunction(s) for diagram number 1031 - // (none) - - // Amplitude(s) for diagram number 1031 - VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1032 OF 1240 *** - - // Wavefunction(s) for diagram number 1032 - // (none) - - // Amplitude(s) for diagram number 1032 - VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1033 OF 1240 *** - - // 
Wavefunction(s) for diagram number 1033 - // (none) - - // Amplitude(s) for diagram number 1033 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1034 OF 1240 *** - - // Wavefunction(s) for diagram number 1034 - // (none) - - // Amplitude(s) for diagram number 1034 - VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1035 OF 1240 *** - - // Wavefunction(s) for diagram number 1035 - // (none) - - // Amplitude(s) for diagram number 1035 - VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1036 OF 1240 *** - - // Wavefunction(s) for diagram number 1036 - // (none) - - // Amplitude(s) for diagram number 1036 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1037 OF 1240 *** - - // Wavefunction(s) for diagram number 1037 - // (none) - - // Amplitude(s) for diagram number 1037 - VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1038 OF 1240 *** - - // Wavefunction(s) for diagram number 1038 - // (none) - - // Amplitude(s) for diagram number 1038 - VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1039 OF 1240 *** - - // Wavefunction(s) for diagram number 1039 - // (none) - - // Amplitude(s) for diagram number 1039 - VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - 
jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1040 OF 1240 *** - - // Wavefunction(s) for diagram number 1040 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 1040 - VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1041 OF 1240 *** - - // Wavefunction(s) for diagram number 1041 - // (none) - - // Amplitude(s) for diagram number 1041 - VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= 
amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1042 OF 1240 *** - - // Wavefunction(s) for diagram number 1042 - // (none) - - // Amplitude(s) for diagram number 1042 - VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1043 OF 1240 *** - - // Wavefunction(s) for diagram number 1043 - // (none) - - // Amplitude(s) for diagram number 1043 - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= 
amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] 
+= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1044 OF 1240 *** - - // Wavefunction(s) for diagram number 1044 - // (none) - - // Amplitude(s) for diagram number 1044 - VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1045 OF 1240 *** - - // Wavefunction(s) for diagram number 1045 - // (none) - - // Amplitude(s) for diagram number 1045 - VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &_fp[0] ); - 
jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1046 OF 1240 *** - - // Wavefunction(s) for diagram number 1046 - // (none) - - // Amplitude(s) for diagram number 1046 - FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[52] -= amp_sv[0]; - - // *** DIAGRAM 1047 OF 1240 *** - - // Wavefunction(s) for diagram number 1047 - // (none) - - // Amplitude(s) for diagram number 1047 - FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[49] -= amp_sv[0]; - - // *** DIAGRAM 1048 OF 1240 *** - - // Wavefunction(s) for diagram number 1048 - // (none) - - // Amplitude(s) for diagram number 1048 - FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 1049 OF 1240 *** - - // Wavefunction(s) for diagram number 1049 - // (none) - - // Amplitude(s) for diagram number 1049 - FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1050 OF 1240 *** - - // Wavefunction(s) for diagram number 1050 - // (none) - - // Amplitude(s) for diagram number 1050 - FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[55] -= amp_sv[0]; - - // *** DIAGRAM 1051 OF 1240 *** - - // Wavefunction(s) for diagram number 1051 - // (none) - - // Amplitude(s) for diagram number 1051 - FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 1052 OF 1240 *** - - // Wavefunction(s) for diagram number 1052 - // (none) - - // Amplitude(s) for diagram number 1052 - FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] -= amp_sv[0]; - - // *** DIAGRAM 1053 OF 1240 *** - - // Wavefunction(s) for diagram number 1053 - // (none) - - // Amplitude(s) for diagram number 1053 - FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
1053 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 1054 OF 1240 *** - - // Wavefunction(s) for diagram number 1054 - // (none) - - // Amplitude(s) for diagram number 1054 - FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[56] -= amp_sv[0]; - - // *** DIAGRAM 1055 OF 1240 *** - - // Wavefunction(s) for diagram number 1055 - // (none) - - // Amplitude(s) for diagram number 1055 - FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1056 OF 1240 *** - - // Wavefunction(s) for diagram number 1056 - // (none) - - // Amplitude(s) for diagram number 1056 - FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] -= amp_sv[0]; - - // *** DIAGRAM 1057 OF 1240 *** - - // Wavefunction(s) for diagram number 1057 - // (none) - - // Amplitude(s) for diagram number 1057 - FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 1058 OF 1240 *** - - // Wavefunction(s) for diagram number 1058 - // (none) - - // Amplitude(s) for diagram number 1058 - FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 1059 OF 1240 *** - - // Wavefunction(s) for diagram number 1059 - // (none) - - // Amplitude(s) for diagram number 1059 - FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1060 OF 1240 *** - - // Wavefunction(s) for diagram number 1060 - // (none) - - // Amplitude(s) for diagram number 1060 - FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 1061 OF 1240 *** - - // Wavefunction(s) for diagram number 1061 - // (none) - - // Amplitude(s) for diagram number 1061 - VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1062 OF 1240 *** - - // Wavefunction(s) for diagram number 1062 - // (none) - - // Amplitude(s) for diagram number 1062 - FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1063 OF 1240 *** - - // Wavefunction(s) for diagram number 1063 - // (none) - - // Amplitude(s) for diagram number 1063 - VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1064 OF 1240 *** - - // Wavefunction(s) for diagram number 1064 - // (none) - - // Amplitude(s) for diagram number 1064 - FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1065 OF 1240 *** - - // Wavefunction(s) for diagram number 1065 - // (none) - - // Amplitude(s) for diagram number 1065 - FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[76] -= amp_sv[0]; - - // *** DIAGRAM 1066 OF 1240 *** - - // Wavefunction(s) for diagram number 1066 - // (none) - - // Amplitude(s) for 
diagram number 1066 - FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[73] -= amp_sv[0]; - - // *** DIAGRAM 1067 OF 1240 *** - - // Wavefunction(s) for diagram number 1067 - // (none) - - // Amplitude(s) for diagram number 1067 - FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[82] -= amp_sv[0]; - - // *** DIAGRAM 1068 OF 1240 *** - - // Wavefunction(s) for diagram number 1068 - // (none) - - // Amplitude(s) for diagram number 1068 - FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] -= amp_sv[0]; - - // *** DIAGRAM 1069 OF 1240 *** - - // Wavefunction(s) for diagram number 1069 - // (none) - - // Amplitude(s) for diagram number 1069 - FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[79] -= amp_sv[0]; - - // *** DIAGRAM 1070 OF 1240 *** - - // Wavefunction(s) for diagram number 1070 - // (none) - - // Amplitude(s) for diagram number 1070 - FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= amp_sv[0]; - - // *** DIAGRAM 1071 OF 1240 *** - - // Wavefunction(s) for diagram number 1071 - // (none) - - // Amplitude(s) for diagram number 1071 - FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[74] -= amp_sv[0]; - - // *** DIAGRAM 1072 OF 1240 *** - - // Wavefunction(s) for diagram number 1072 - // (none) - - // Amplitude(s) for diagram number 1072 - FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= amp_sv[0]; - - // *** DIAGRAM 1073 OF 1240 *** - - // Wavefunction(s) for diagram number 1073 - // (none) - - // Amplitude(s) for diagram number 1073 - FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[80] -= amp_sv[0]; - - // *** DIAGRAM 1074 OF 1240 *** - - // Wavefunction(s) for diagram number 1074 - // (none) - - // Amplitude(s) for diagram number 1074 - FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[86] -= amp_sv[0]; - - // *** DIAGRAM 1075 
OF 1240 *** - - // Wavefunction(s) for diagram number 1075 - // (none) - - // Amplitude(s) for diagram number 1075 - FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] -= amp_sv[0]; - - // *** DIAGRAM 1076 OF 1240 *** - - // Wavefunction(s) for diagram number 1076 - // (none) - - // Amplitude(s) for diagram number 1076 - FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[84] -= amp_sv[0]; - - // *** DIAGRAM 1077 OF 1240 *** - - // Wavefunction(s) for diagram number 1077 - // (none) - - // Amplitude(s) for diagram number 1077 - FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 1078 OF 1240 *** - - // Wavefunction(s) for diagram number 1078 - // (none) - - // Amplitude(s) for diagram number 1078 - FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1079 OF 1240 *** - - // Wavefunction(s) for diagram number 1079 - // (none) - - // Amplitude(s) for diagram number 1079 - FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 1080 OF 1240 *** - - // Wavefunction(s) for diagram number 1080 - // (none) - - // Amplitude(s) for diagram number 1080 - VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1081 OF 1240 *** - - // Wavefunction(s) for diagram number 1081 - // (none) - - // Amplitude(s) for diagram number 1081 - FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1082 OF 1240 *** - - // Wavefunction(s) for diagram number 1082 - // (none) - - // Amplitude(s) 
for diagram number 1082 - VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1083 OF 1240 *** - - // Wavefunction(s) for diagram number 1083 - // (none) - - // Amplitude(s) for diagram number 1083 - FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1084 OF 1240 *** - - // Wavefunction(s) for diagram number 1084 - // (none) - - // Amplitude(s) for diagram number 1084 - FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 1085 OF 1240 *** - - // Wavefunction(s) for diagram number 1085 - // (none) - - // Amplitude(s) for diagram number 1085 - FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 1086 OF 1240 *** - - // Wavefunction(s) for diagram number 1086 - // (none) - - // Amplitude(s) for diagram number 1086 - FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 1087 OF 1240 *** - - // Wavefunction(s) for diagram number 1087 - // (none) - - // Amplitude(s) for diagram number 1087 - FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1087 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1088 OF 1240 *** - - // Wavefunction(s) for diagram number 1088 - // (none) - - // Amplitude(s) for diagram number 1088 - FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1089 OF 1240 *** - - // Wavefunction(s) for diagram number 1089 - // (none) - - // Amplitude(s) for diagram number 1089 - FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1090 OF 1240 *** - - // Wavefunction(s) for diagram number 1090 - // (none) - - // Amplitude(s) for diagram number 1090 - FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 1091 OF 1240 *** - - // Wavefunction(s) for diagram number 1091 - // (none) - - // Amplitude(s) for diagram number 1091 - FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= amp_sv[0]; - - // *** DIAGRAM 1092 OF 1240 *** - - // Wavefunction(s) for diagram number 1092 - // (none) - - // Amplitude(s) for diagram number 1092 - FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] -= amp_sv[0]; - - // *** DIAGRAM 1093 OF 1240 *** - - // Wavefunction(s) for diagram number 1093 - // (none) - - // Amplitude(s) for diagram number 1093 - FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1094 OF 1240 *** - - // Wavefunction(s) for diagram number 1094 - // (none) - - // Amplitude(s) for diagram number 1094 - FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 1095 OF 1240 *** - - // Wavefunction(s) for diagram number 1095 - // (none) - - // Amplitude(s) for diagram number 1095 - FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1096 OF 1240 *** - - // Wavefunction(s) for diagram number 1096 - // (none) - - // Amplitude(s) for diagram number 1096 - FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 1097 OF 1240 *** - - // Wavefunction(s) for diagram number 1097 - // (none) - - // Amplitude(s) for diagram number 1097 - FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1098 OF 1240 *** - - // Wavefunction(s) for diagram number 1098 - // (none) - - // Amplitude(s) for diagram number 1098 - FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 1099 OF 1240 *** - - // Wavefunction(s) for diagram number 1099 - // (none) - - // Amplitude(s) for diagram number 1099 - VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1100 OF 1240 *** - - // Wavefunction(s) for diagram number 1100 - // (none) - - // Amplitude(s) for diagram number 1100 - FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1101 OF 1240 *** - - // Wavefunction(s) for diagram number 1101 - // (none) - - // Amplitude(s) for diagram number 1101 - VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1102 OF 1240 *** - - // Wavefunction(s) for diagram number 1102 - // (none) - - // Amplitude(s) for diagram number 1102 - FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1103 OF 1240 *** - - // Wavefunction(s) for diagram number 1103 - // (none) - - // Amplitude(s) for diagram number 1103 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1104 OF 1240 *** - - // Wavefunction(s) for diagram number 1104 - // (none) - - // Amplitude(s) for diagram number 1104 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1105 OF 1240 *** - - // Wavefunction(s) for diagram number 1105 - // (none) - - // Amplitude(s) for diagram number 1105 - FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1106 OF 1240 *** - - // Wavefunction(s) for diagram number 1106 - // (none) - - // Amplitude(s) for diagram number 1106 - VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1107 OF 1240 *** - - // Wavefunction(s) for diagram number 1107 - // (none) - - // Amplitude(s) for diagram number 1107 - FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
1107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1108 OF 1240 *** - - // Wavefunction(s) for diagram number 1108 - // (none) - - // Amplitude(s) for diagram number 1108 - VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1109 OF 1240 *** - - // Wavefunction(s) for diagram number 1109 - // (none) - - // Amplitude(s) for diagram number 1109 - FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1110 OF 1240 *** - - // Wavefunction(s) for diagram number 1110 - // (none) - - // Amplitude(s) for diagram number 1110 - FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1111 OF 1240 *** - - // Wavefunction(s) for diagram number 1111 - // (none) - - // Amplitude(s) for diagram number 1111 - FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1112 OF 1240 *** - - // Wavefunction(s) for diagram number 1112 - // (none) - - // Amplitude(s) for diagram number 1112 - FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1113 OF 1240 *** - - // Wavefunction(s) for diagram number 1113 - // (none) - - // Amplitude(s) for diagram number 1113 - VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1114 OF 1240 *** - - // Wavefunction(s) for diagram number 1114 - // (none) - - // Amplitude(s) for diagram number 1114 - FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1115 OF 1240 *** - - // Wavefunction(s) for diagram number 1115 - // (none) - - // Amplitude(s) for diagram number 1115 - VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1116 OF 1240 *** - - // Wavefunction(s) for diagram number 1116 - // (none) - - // Amplitude(s) for diagram number 1116 - FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1117 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1117
- // (none)
-
- // Amplitude(s) for diagram number 1117
- FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
-
- // *** DIAGRAM 1118 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1118
- // (none)
-
- // Amplitude(s) for diagram number 1118
- FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1119 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1119
- // (none)
-
- // Amplitude(s) for diagram number 1119
- FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
-
- // *** DIAGRAM 1120 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1120
- // (none)
-
- // Amplitude(s) for diagram number 1120
- VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1121 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1121
- // (none)
-
- // Amplitude(s) for diagram number 1121
- FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1122 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1122
- // (none)
-
- // Amplitude(s) for diagram number 1122
- VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1123 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1123
- // (none)
-
- // Amplitude(s) for diagram number 1123
- FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1124 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1124
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
-
- // Amplitude(s) for diagram number 1124
- VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[1] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1125 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1125
- VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
- VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
-
- // Amplitude(s) for diagram number 1125
- VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1126 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1126
- VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
- VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
- VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 1126
- VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1127 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1127
- // (none)
-
- // Amplitude(s) for diagram number 1127
- VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[1] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1128 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1128
- FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
- FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
-
- // Amplitude(s) for diagram number 1128
- FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
-
- // *** DIAGRAM 1129 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1129
- // (none)
-
- // Amplitude(s) for diagram number 1129
- FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1130 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1130
- // (none)
-
- // Amplitude(s) for diagram number 1130
- FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
-
- // *** DIAGRAM 1131 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1131
- // (none)
-
- // Amplitude(s) for diagram number 1131
- FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1132 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1132
- // (none)
-
- // Amplitude(s) for diagram number 1132
- FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1133 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1133
- // (none)
-
- // Amplitude(s) for diagram number 1133
- FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
-
- // *** DIAGRAM 1134 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1134
- FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
- FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
- FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
- // Amplitude(s) for diagram number 1134
- FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
-
- // *** DIAGRAM 1135 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1135
- // (none)
-
- // Amplitude(s) for diagram number 1135
- FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1136 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1136
- // (none)
-
- // Amplitude(s) for diagram number 1136
- FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
-
- // *** DIAGRAM 1137 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1137
- // (none)
-
- // Amplitude(s) for diagram number 1137
- FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1138 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1138
- // (none)
-
- // Amplitude(s) for diagram number 1138
- FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1139 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1139
- // (none)
-
- // Amplitude(s) for diagram number 1139
- FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1140 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1140
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 1140
- VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1141 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1141
- VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] );
- VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] );
- VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 1141
- VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1142 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1142
- VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
- VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
-
- // Amplitude(s) for diagram number 1142
- VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1143 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1143
- // (none)
-
- // Amplitude(s) for diagram number 1143
- VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
-
- // *** DIAGRAM 1144 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1144
- FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
- FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
- FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
- // Amplitude(s) for diagram number 1144
- FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
-
- // *** DIAGRAM 1145 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1145
- // (none)
-
- // Amplitude(s) for diagram number 1145
- FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1146 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1146
- // (none)
-
- // Amplitude(s) for diagram number 1146
- FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
-
- // *** DIAGRAM 1147 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1147
- // (none)
-
- // Amplitude(s) for diagram number 1147
- FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
-
- // *** DIAGRAM 1148 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1148
- // (none)
-
- // Amplitude(s) for diagram number 1148
- FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1149 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1149
- // (none)
-
- // Amplitude(s) for diagram number 1149
- FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** DIAGRAM 1150 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1150
- FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
- FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
- FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] );
-
- // Amplitude(s) for diagram number 1150
- FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
-
- // *** DIAGRAM 1151 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1151
- // (none)
-
- // Amplitude(s) for diagram number 1151
- FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1152 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1152
- // (none)
-
- // Amplitude(s) for diagram number 1152
- FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
-
- // *** DIAGRAM 1153 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1153
- // (none)
-
- // Amplitude(s) for diagram number 1153
- FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1154 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1154
- // (none)
-
- // Amplitude(s) for diagram number 1154
- FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1155 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1155
- // (none)
-
- // Amplitude(s) for diagram number 1155
- FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1156 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1156
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 1156
- VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1157 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1157
- VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] );
- VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] );
- VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 1157
- VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1158 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1158
- VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] );
- VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] );
- VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
- // Amplitude(s) for diagram number 1158
- VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1159 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1159
- // (none)
-
- // Amplitude(s) for diagram number 1159
- VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
-
- // *** DIAGRAM 1160 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1160
- FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
- FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 1160
- FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
-
- // *** DIAGRAM 1161 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1161
- // (none)
-
- // Amplitude(s) for diagram number 1161
- FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1162 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1162
- // (none)
-
- // Amplitude(s) for diagram number 1162
- FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
-
- // *** DIAGRAM 1163 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1163
- // (none)
-
- // Amplitude(s) for diagram number 1163
- FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
-
- // *** DIAGRAM 1164 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1164
- // (none)
-
- // Amplitude(s) for diagram number 1164
- FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1165 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1165
- // (none)
-
- // Amplitude(s) for diagram number 1165
- FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
-
- // *** DIAGRAM 1166 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1166
- FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
- FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
- FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
-
- // Amplitude(s) for diagram number 1166
- FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1167 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1167
- // (none)
-
- // Amplitude(s) for diagram number 1167
- FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1168
- // (none)
-
- // Amplitude(s) for diagram number 1168
- FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1169
- // (none)
-
- // Amplitude(s) for diagram number 1169
- FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1170
- // (none)
-
- // Amplitude(s) for diagram number 1170
- FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1171
- // (none)
-
- // Amplitude(s) for diagram number 1171
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) *
amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1172 OF 1240 *** - - // Wavefunction(s) for diagram number 1172 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] ); - FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 1172 - FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[43] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 1173 OF 1240 *** - - // Wavefunction(s) for diagram number 1173 - VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] ); - VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); - - // Amplitude(s) for diagram number 1173 - FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1174 OF 1240 *** - - // Wavefunction(s) for diagram number 1174 - // (none) - - // Amplitude(s) for diagram number 1174 - FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 
1.0, &_fp[0] ); - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - - // *** DIAGRAM 1175 OF 1240 *** - - // Wavefunction(s) for diagram number 1175 - FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 1175 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[15] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - - // *** DIAGRAM 1176 OF 1240 *** - - // Wavefunction(s) for diagram number 1176 - // (none) - - // Amplitude(s) for diagram number 1176 - FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1177 OF 1240 *** - - // Wavefunction(s) for diagram number 1177 - // (none) - - // Amplitude(s) for diagram number 1177 - FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1178 OF 1240 *** - - // Wavefunction(s) for diagram number 1178 - // 
(none) - - // Amplitude(s) for diagram number 1178 - FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1179 OF 1240 *** - - // Wavefunction(s) for diagram number 1179 - // (none) - - // Amplitude(s) for diagram number 1179 - FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1180 OF 1240 *** - - // Wavefunction(s) for diagram number 1180 - // (none) - - // Amplitude(s) for diagram number 1180 - VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - 
jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 1181 OF 1240 *** - - // Wavefunction(s) for diagram number 1181 - // (none) - - // Amplitude(s) for diagram number 1181 - VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= 
amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1182 OF 1240 *** - - // Wavefunction(s) for diagram number 1182 - VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1182 - VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= 
amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 1183 OF 1240 *** - - // Wavefunction(s) for diagram number 1183 - // (none) - - // Amplitude(s) for diagram number 1183 - VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1184 OF 1240 *** - - // Wavefunction(s) for diagram number 1184 - // (none) - - // Amplitude(s) for diagram number 1184 - FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1185 OF 1240 *** - - // Wavefunction(s) for diagram number 1185 - // (none) - - // Amplitude(s) for diagram number 1185 - FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 1186 OF 1240 *** - - // Wavefunction(s) for diagram number 1186 - // (none) - - // Amplitude(s) for diagram number 1186 - FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1187 OF 1240 *** - - // Wavefunction(s) for diagram number 1187 - // (none) - - // Amplitude(s) for diagram number 1187 - FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - - // *** DIAGRAM 1188 OF 1240 *** - - // Wavefunction(s) for diagram number 1188 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); - FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); - - // Amplitude(s) for diagram number 1188 - FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= 
amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[37] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - - // *** DIAGRAM 1189 OF 1240 *** - - // Wavefunction(s) for diagram number 1189 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 1189 - FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1190 OF 1240 *** - - // Wavefunction(s) for diagram number 1190 - // (none) - - // Amplitude(s) for diagram number 1190 - FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - - // *** DIAGRAM 1191 OF 1240 *** - - // Wavefunction(s) for diagram number 1191 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - - // Amplitude(s) for diagram number 1191 - FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - 
jamp_sv[109] -= amp_sv[0]; - - // *** DIAGRAM 1192 OF 1240 *** - - // Wavefunction(s) for diagram number 1192 - // (none) - - // Amplitude(s) for diagram number 1192 - FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1193 OF 1240 *** - - // Wavefunction(s) for diagram number 1193 - // (none) - - // Amplitude(s) for diagram number 1193 - FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - - // *** DIAGRAM 1194 OF 1240 *** - - // Wavefunction(s) for diagram number 1194 - // (none) - - // Amplitude(s) for diagram number 1194 - FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 
1195 OF 1240 *** - - // Wavefunction(s) for diagram number 1195 - // (none) - - // Amplitude(s) for diagram number 1195 - FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1196 OF 1240 *** - - // Wavefunction(s) for diagram number 1196 - // (none) - - // Amplitude(s) for diagram number 1196 - VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - - // *** DIAGRAM 1197 OF 1240 *** - - // Wavefunction(s) for diagram number 1197 - // (none) - - // Amplitude(s) for diagram number 1197 - VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - 
jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - 
jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1198 OF 1240 *** - - // Wavefunction(s) for diagram number 1198 - VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 1198 - VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1199 OF 1240 *** - - // Wavefunction(s) for diagram number 1199 - // (none) - - // Amplitude(s) for diagram number 1199 - VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - 
jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1200 OF 1240 *** - - // Wavefunction(s) for diagram number 1200 - // (none) - - // Amplitude(s) for diagram number 1200 - FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1201 OF 1240 *** - - // Wavefunction(s) for diagram number 1201 - // (none) - - // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - - // *** DIAGRAM 1202 OF 1240 *** - - // Wavefunction(s) for diagram number 1202 - // (none) - - // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1203 OF 1240 *** - - // Wavefunction(s) for diagram number 1203 - // (none) - - // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1204 OF 1240 *** - - // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 1205 OF 1240 *** - - // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] 
+= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1206 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1206
- // (none)
-
- // Amplitude(s) for diagram number 1206
- FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
-
- // *** DIAGRAM 1207 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1207
- FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
- FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
-
- // Amplitude(s) for diagram number 1207
- FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
-
- // *** DIAGRAM 1208 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1208
- // (none)
-
- // Amplitude(s) for diagram number 1208
- FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1209 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1209
- // (none)
-
- // Amplitude(s) for diagram number 1209
- FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
-
- // *** DIAGRAM 1210 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1210
- // (none)
-
- // Amplitude(s) for diagram number 1210
- FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1211 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1211
- // (none)
-
- // Amplitude(s) for diagram number 1211
- FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1212 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1212
- // (none)
-
- // Amplitude(s) for diagram number 1212
- VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
-
- // *** DIAGRAM 1213 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1213
- // (none)
-
- // Amplitude(s) for diagram number 1213
- VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
-
- // *** DIAGRAM 1214 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1214
- VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] );
- VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 1214
- VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
-
- // *** DIAGRAM 1215 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1215
- // (none)
-
- // Amplitude(s) for diagram number 1215
- VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
-
- // *** DIAGRAM 1216 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1216
- // (none)
-
- // Amplitude(s) for diagram number 1216
- FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1217 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1217
- // (none)
-
- // Amplitude(s) for diagram number 1217
- FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
-
- // *** DIAGRAM 1218 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1218
- // (none)
-
- // Amplitude(s) for diagram number 1218
- FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1219 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1219
- // (none)
-
- // Amplitude(s) for diagram number 1219
- FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
-
- // *** DIAGRAM 1220 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1220
- // (none)
-
- // Amplitude(s) for diagram number 1220
- VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 1221 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1221
- VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] );
- VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] );
- VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] );
-
- // Amplitude(s) for diagram number 1221
- VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1222 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1222
- // (none)
-
- // Amplitude(s) for diagram number 1222
- VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1223 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1223
- // (none)
-
- // Amplitude(s) for diagram number 1223
- FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1224 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1224
- // (none)
-
- // Amplitude(s) for diagram number 1224
- FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 1225 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1225
- // (none)
-
- // Amplitude(s) for diagram number 1225
- FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1226 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1226
- // (none)
-
- // Amplitude(s) for diagram number 1226
- FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
-
- // *** DIAGRAM 1227 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1227
- // (none)
-
- // Amplitude(s) for diagram number 1227
- VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
-
- // *** DIAGRAM 1228 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1228
- VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] );
- VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] );
- VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] );
-
- // Amplitude(s) for diagram number 1228
- VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1229 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1229
- // (none)
-
- // Amplitude(s) for diagram number 1229
- VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
-
- // *** DIAGRAM 1230 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1230
- // (none)
-
- // Amplitude(s) for diagram number 1230
- FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1231 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1231
- // (none)
-
- // Amplitude(s) for diagram number 1231
- FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
-
- // *** DIAGRAM 1232 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1232
- // (none)
-
- // Amplitude(s) for diagram number 1232
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1233 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1233
- // (none)
-
- // Amplitude(s) for diagram number 1233
- FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
-
- // *** DIAGRAM 1234 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1234
- // (none)
-
- // Amplitude(s) for diagram number 1234
- VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1235 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1235
- VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] );
- VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] );
- VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] );
-
- // Amplitude(s) for diagram number 1235
- VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1236 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1236
- // (none)
-
- // Amplitude(s) for diagram number 1236
- VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1237 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1237
- // (none)
-
- // Amplitude(s) for diagram number 1237
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1238 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1238
- // (none)
-
- // Amplitude(s) for diagram number 1238
- FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
-
- // *** DIAGRAM 1239 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1239
- // (none)
-
- // Amplitude(s) for diagram number 1239
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1240 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1240
- // (none)
-
- // Amplitude(s) for diagram number 1240
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
-   for( int icol = 0; icol < ncolor; icol++ )
-     jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?)
-
- // The color denominators (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120]
-
- // The color matrix (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 },
- { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 },
- { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 },
- { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 },
- { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 },
- { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 },
- { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 },
- { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 },
- { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 },
- { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 },
- { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 },
- { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 },
- { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 },
- { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 },
- { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 },
- { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 },
- { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 },
- { 496, -224, -80, -8,
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
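[Editorial aside, not part of the diff.] The removed comment block above compresses three separate ideas that are easy to miss: (1) since the color matrix M is real, the quadratic form jamp†·M·jamp for jamp = A + iB reduces to AᵀMA + BᵀMB, because the imaginary cross terms iAᵀMB − iBᵀMA cancel when M is symmetric; (2) since M is symmetric, only the diagonal and upper triangle need to be visited, with a factor 2 folded into the off-diagonal terms; (3) that factor 2 and the 1/denom[icol] normalization can be baked into the matrix once at compile time via a constexpr constructor. A minimal standalone sketch of the same reduction, using a toy 2x2 matrix and plain doubles in place of the generated 120x120 data and the plugin's SIMD fptype2_sv types (all values below are illustrative only):

#include <cstdio>

// Toy stand-ins for the generated constants (NOT the real process data)
constexpr int ncolor = 2;
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
constexpr double denom[ncolor] = { 3, 3 };

// Pre-compute a constexpr triangular color matrix properly normalized (#475):
// diagonal entries get 1/denom, off-diagonal entries get the symmetry factor 2/denom
struct TriangularNormalizedColorMatrix
{
  constexpr TriangularNormalizedColorMatrix()
    : value()
  {
    for( int icol = 0; icol < ncolor; icol++ )
    {
      value[icol][icol] = cf[icol][icol] / denom[icol];
      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
    }
  }
  double value[ncolor][ncolor];
};
static constexpr auto cf2 = TriangularNormalizedColorMatrix();

// |M|^2 = sum_ij jamp_i^* cf_ij jamp_j / denom_i, computed as A.M.A + B.M.B
double colorSum( const double jampR[], const double jampI[] )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2.value[icol][icol] * jampR[icol]; // diagonal term
    double ztempI = cf2.value[icol][icol] * jampI[icol];
    for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only
    {
      ztempR += cf2.value[icol][jcol] * jampR[jcol];
      ztempI += cf2.value[icol][jcol] * jampI[jcol];
    }
    deltaME += jampR[icol] * ztempR + jampI[icol] * ztempI;
  }
  return deltaME;
}

int main()
{
  const double jampR[ncolor] = { 1.0, 0.5 };
  const double jampI[ncolor] = { -0.25, 2.0 };
  printf( "|M|^2 = %f\n", colorSum( jampR, jampI ) );
  return 0;
}

Note that folding the factor 2 into cf2 is only exact when cf[i][j]/denom[i] is itself symmetric (true here because denom is constant, and implicitly assumed by the removed C++ code); the removed CUDA branch instead keeps the full-matrix loop because, as the comment records, the triangular form was observed to be slower there.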
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 1240 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
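[Editorial aside, not part of the diff.] All the diagramN launches in this long run share a single literal argument list; the MOMENTA, JAMPS, CHANNELIDS and NUMERATORS/DENOMINATORS blocks above exist precisely so that the call sites stay uniform whether or not MGONGPU_SUPPORTS_MULTICHANNEL is compiled in, with nullptr standing in for the unused buffers. A minimal sketch of that calling convention, with a plain C++ function in place of a GPU kernel (signature and body are illustrative, not the plugin's actual diagram kernels):

#include <cassert>

typedef double fptype;

// One uniform signature for every diagram, multichannel or not: when
// MGONGPU_SUPPORTS_MULTICHANNEL is undefined, the caller passes nullptr for
// channelIds, numerators and denominators, and the callee asserts that.
void diagramN( fptype* jamps, const unsigned int* channelIds,
               fptype* numerators, fptype* denominators )
{
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
#endif
  // ... compute this diagram's amplitude and accumulate it into the colour
  // flows (jamps); if multichannel is enabled and channelIds selects this
  // diagram, also accumulate |amp|^2 terms into numerators and denominators ...
  (void)jamps; (void)channelIds; (void)numerators; (void)denominators;
}

This uniformity is presumably what allows code generation to emit the 1240 launch statements below mechanically, with no per-diagram conditionals at the call sites.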
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram132, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram185, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram238, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram241, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram242, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram243, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram244, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram245, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram246, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram247, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram248, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram249, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram250, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram251, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram252, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram253, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram254, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram255, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram256, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram257, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram258, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram259, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram260, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram261, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram262, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram263, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram264, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram265, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram266, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram267, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram268, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram269, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram270, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram271, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram272, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram273, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram274, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram275, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram276, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram277, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram278, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram279, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram280, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram281, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram282, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram283, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram284, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram285, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram286, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram287, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram288, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram289, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram290, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram291, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram292, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram293, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram294, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram295, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram296, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram297, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram298, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram299, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram300, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram301, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram302, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram303, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram304, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram305, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram306, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram307, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram308, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram309, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram310, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram311, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram312, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram313, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram314, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram315, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram316, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram317, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram318, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram319, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram320, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram321, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram322, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram323, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram324, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram325, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram326, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram327, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram328, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram329, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram330, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram331, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram332, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram333, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram334, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram335, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram336, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram337, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram338, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram339, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram340, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram341, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram342, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram343, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram344, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram345, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram346, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram347, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram348, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram349, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram350, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram351, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram352, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram353, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram354, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram355, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram356, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram357, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram358, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram359, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram360, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram361, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram362, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram363, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram364, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram365, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram366, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram367, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram368, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram369, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram370, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram371, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram372, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram373, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram374, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram375, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram376, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram377, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram378, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram379, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram380, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram381, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram382, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram383, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram384, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram385, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram386, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram387, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram388, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram389, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram390, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram391, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram392, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram393, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram394, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram395, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram396, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram397, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram398, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram399, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram400, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram401, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram402, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram403, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram404, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram405, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram406, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram407, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram408, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram409, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram410, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram411, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram412, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram413, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram414, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram415, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram416, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram417, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram418, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram419, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram420, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram421, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram422, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram423, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram424, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram425, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram426, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram427, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram428, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram429, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram430, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram431, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram432, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram433, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram434, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram435, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram436, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram437, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram438, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram439, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram440, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram441, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram442, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram443, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram444, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram445, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram446, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram447, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram448, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram449, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram450, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram451, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram452, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram453, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram454, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram455, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram456, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram457, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram458, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram459, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram460, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram461, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram462, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram463, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram464, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram465, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram466, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram467, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram468, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram469, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram470, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram471, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram472, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram473, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram474, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram475, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram476, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram477, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram478, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram479, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram480, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram481, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram482, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram483, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram484, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram485, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram486, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram487, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram488, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram489, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram490, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram491, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram492, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram493, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram494, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram495, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram496, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram497, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram498, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram499, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram500, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram501, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram502, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram503, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram504, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram505, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram506, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram507, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram508, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram509, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram510, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram511, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram512, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram513, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram514, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram515, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram516, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram517, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram518, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram519, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram520, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram521, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram522, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram523, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram524, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram525, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram526, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram527, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram528, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram529, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram530, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram531, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram532, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram533, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram534, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram535, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram536, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram537, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram538, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram539, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram540, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram541, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram542, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram543, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram544, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram545, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram546, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram547, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram548, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram549, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram550, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram551, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram552, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram553, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram554, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram555, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram556, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram557, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram558, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram559, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram560, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram561, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram562, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram563, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram564, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram565, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram566, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram567, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram568, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram569, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram570, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram571, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram572, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram573, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram574, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram575, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram576, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram577, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram578, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram579, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram580, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram581, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram582, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram583, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram584, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram585, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram586, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram587, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram588, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram589, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram590, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram591, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram592, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram593, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram594, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram595, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram596, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram597, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram598, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram599, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram600, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram601, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram602, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram603, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram604, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram605, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram606, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram607, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram608, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram609, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram610, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram611, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram612, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram613, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram614, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram615, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram616, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram617, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram618, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram619, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram620, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram621, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram622, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram623, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram624, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram625, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram626, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram627, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram628, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram629, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram630, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram631, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram632, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram633, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram634, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram635, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram636, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram637, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram638, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram639, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram640, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram641, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram642, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram643, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram644, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram645, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram646, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram647, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram648, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram649, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram650, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram651, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram652, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram653, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram654, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram655, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram656, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram657, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram658, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram659, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram660, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram661, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram662, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram663, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram664, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram665, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram666, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram667, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram668, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram669, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram670, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram671, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram672, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram673, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram674, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram675, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram676, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram677, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram678, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram679, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram680, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram681, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram682, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram683, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram684, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram685, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram686, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram687, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram688, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram689, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram690, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram691, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram692, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram693, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram694, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram695, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram696, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram697, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram698, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram699, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram700, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram701, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram702, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram703, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram704, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram705, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram706, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram707, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram708, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram709, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram710, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram711, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram712, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram713, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram714, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram715, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram716, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram717, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram718, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram719, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram720, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram721, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram722, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram723, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram724, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram725, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram726, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram727, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram728, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram729, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram730, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram731, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram732, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram733, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram734, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram735, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram736, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram737, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram738, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram739, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram740, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram741, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram742, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram743, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram744, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram745, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram746, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram747, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram748, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram749, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram750, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram751, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram752, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram753, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram754, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram755, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram756, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram757, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram758, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram759, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram760, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram761, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram762, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram763, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram764, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram765, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram766, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram767, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram768, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram769, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram770, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram771, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram772, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram773, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram774, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram775, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram776, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram777, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram778, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram779, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram780, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram781, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram782, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram783, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram784, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram785, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram786, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram787, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram788, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram789, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram790, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram791, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram792, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram793, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram794, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram795, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram796, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram797, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram798, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram799, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram800, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram801, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram802, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram803, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram804, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram805, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram806, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram807, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram808, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram809, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram810, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram811, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram812, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram813, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram814, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram815, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram816, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram817, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram818, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram819, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram820, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram821, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram822, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram823, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram824, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram825, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram826, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram827, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram828, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram829, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram830, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram831, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram832, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram833, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram834, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram835, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram836, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram837, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram838, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram839, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram840, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram841, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram842, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram843, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram844, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram845, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram846, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram847, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+
gpuLaunchKernelStream( diagram848, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram849, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram850, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram851, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram852, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram853, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram854, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram855, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram856, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram857, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram858, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram859, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram860, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram861, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram862, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram863, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram864, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram865, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram866, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram867, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram868, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram869, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram870, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram871, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram872, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram873, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram874, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram875, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram876, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram877, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram878, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram879, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram880, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram881, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram882, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram883, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram884, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram885, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram886, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram887, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram888, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram889, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram890, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram891, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram892, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram893, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram894, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram895, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram896, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram897, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram898, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram899, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram900, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram901, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram902, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram903, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram904, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram905, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram906, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram907, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram908, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram909, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram910, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram911, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram912, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram913, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram914, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram915, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram916, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram917, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram918, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram919, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram920, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram921, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram922, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram923, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram924, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram925, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram926, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram927, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram928, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram929, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram930, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram931, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram932, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram933, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram934, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram935, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram936, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram937, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram938, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram939, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram940, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram941, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram942, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram943, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram944, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram945, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram946, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram947, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram948, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram949, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram950, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram951, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram952, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram953, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram954, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram955, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram956, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram957, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram958, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram959, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram960, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram961, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram962, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram963, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram964, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram965, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram966, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram967, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram968, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram969, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram970, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram971, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram972, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram973, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram974, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram975, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram976, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram977, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram978, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram979, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram980, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram981, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram982, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram983, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram984, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram985, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram986, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram987, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram988, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram989, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram990, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram991, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram992, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram993, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram994, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram995, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram996, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram997, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram998, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram999, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1000, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1001, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1002, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1003, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1004, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1005, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1006, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); 
+ gpuLaunchKernelStream( diagram1007, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1008, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1009, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1010, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1011, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1012, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1013, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1014, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1015, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1016, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1017, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1018, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1019, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1020, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1021, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1022, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1023, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1024, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1025, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1026, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1027, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1028, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1029, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1030, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1031, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1032, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1033, 
gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1034, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1035, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1036, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1037, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1038, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1039, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1040, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1041, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1042, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1043, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1044, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1045, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1046, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1047, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1048, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1049, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1050, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1051, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1052, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1053, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1054, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1055, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1056, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1057, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1058, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1059, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1060, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1061, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1062, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1063, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1064, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1065, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1066, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1067, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1068, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1069, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1070, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1071, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1072, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1073, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1074, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1075, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1076, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1077, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1078, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1079, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1080, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1081, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1082, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1083, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1084, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1085, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, 
denominators ); + gpuLaunchKernelStream( diagram1086, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1087, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1088, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1089, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1090, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1091, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1092, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1093, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1094, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1095, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1096, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1097, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1098, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1099, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram1112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1138, gpublocks, gputhreads, 
gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram1191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+#else
+ diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+ diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram124( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram125( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram126( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram127( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram128( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram129( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram130( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram131( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram132( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram133( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram134( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram135( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram136( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram137( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram138( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram139( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram140( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram141( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram142( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram143( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram144( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram145( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram146( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram147( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram148( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram149( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram150( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram151( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram152( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram153( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram154( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram155( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram156( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram157( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram158( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram159( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram160( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram161( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram162( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram163( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram164( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram165( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram166( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram167( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram168( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram169( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram170( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram171( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram172( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram173( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram174( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram175( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram176( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram177( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram178( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram179( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram180( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram181( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram182( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram183( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram184( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram185( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram186( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram187( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram188( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram189( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram190( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram191( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram192( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram193( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram194( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram195( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram196( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram197( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram198( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram199( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram200( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram201( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram202( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram203( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram204( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram205( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram206( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram207( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram208( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram209( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram210( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram211( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram212( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram213( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram214( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram215( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram216( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram217( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram218( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram219( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram220( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram221( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram222( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram223( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram224( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram225( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram226( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram227( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram228( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram229( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram230( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram231( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram232( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram233( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram234( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram235( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram236( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram237( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram238( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram239( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram240( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram241( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram242( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram243( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram244( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram245( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram246( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram247( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram248( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram249( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram250( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram251( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram252( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram253( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram254( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram255( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram256( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram257( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram258( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram259( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram260( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram261( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram262( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram263( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram264( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram265( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram266( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram267( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram268( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram269( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram270( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram271( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram272( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram273( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram274( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram275( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram276( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram277( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram278( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram279( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram280( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram281( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram282( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram283( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram284( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram285( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram286( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram287( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram288( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram289( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram290( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram291( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram292( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram293( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram294( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram295( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram296( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram297( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram298( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram299( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram300( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram301( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram302( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram303( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram304( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram305( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram306( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram307( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram308( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram309( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram310( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram311( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram312( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram313( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram314( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram315( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram316( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram317( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram318( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram319( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram320( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram321( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram322( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram323( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram324( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram325( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram326( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram327( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram328( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram329( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram330( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram331( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram332( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram333( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram334( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram335( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram336( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram337( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram338( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram339( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram340( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram341( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram342( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram343( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram344( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram345( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram346( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram347( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram348( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram349( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram350( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram351( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram352( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram353( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram354( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram355( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram356( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram357( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram358( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram359( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram360( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram361( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram362( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram363( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram364( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram365( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram366( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram367( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram368( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram369( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram370( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram371( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram372( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram373( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram374( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram375( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram376( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram377( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram378( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram379( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram380( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram381( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram382( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram383( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram384( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram385( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram387( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram388( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram389( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram390( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram391( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram392( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram393( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram394( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram395( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram396( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram397( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram398( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram399( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram400( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram401( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram402( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram403( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram404( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram405( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram406( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram407( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram408( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram409( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram410( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram411( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram412( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram413( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram414( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram415( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram416( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram417( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram418( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram419( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram420( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram421( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram422( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram423( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram424( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram425( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram426( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram427( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram428( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram429( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram430( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram431( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram432( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram433( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram434( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram435( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram436( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram437( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram438( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram439( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram440( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram441( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram442( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram443( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram444( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram445( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram446( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram447( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram448( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram449( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram450( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram451( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram452( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram453( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram454( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram455( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram456( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram457( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram458( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram459( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram460( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram461( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram462( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram463( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram464( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram465( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram466( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram467( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram468( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram469( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram470( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram471( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram472( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram473( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram474( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram475( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram476( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram477( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram478( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram479( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram480( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram481( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram482( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram483( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram484( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram485( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram486( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram487( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram488( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram489( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram490( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram491( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram492( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram493( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram494( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram495( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram496( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram497( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram498( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram499( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram500( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram501( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram502( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram503( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram504( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram505( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram506( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram507( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram508( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram509( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram510( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram511( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram512( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram513( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram514( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram515( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram516( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram517( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram518( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram519( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram520( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram521( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram522( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram523( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram524( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram525( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram526( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram527( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram528( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram529( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram530( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram531( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram532( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram533( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram534( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram535( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram536( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram537( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram538( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram539( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram540( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram541( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram542( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram543( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram544( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram545( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram546( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram547( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram548( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram549( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram550( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram551( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram552( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram553( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram554( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram555( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram556( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram557( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram558( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram559( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram560( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram561( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram562( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram563( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram564( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram565( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram566( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram567( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram568( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram569( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram570( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram571( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram572( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram573( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram574( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram575( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram576( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram577( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram578( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram579( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram580( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram581( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram582( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram583( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram584( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram585( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram586( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram587( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram588( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram590( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram591( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram592( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram593( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram594( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram595( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram596( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram597( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram598( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram599( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram600( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram601( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram602( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram603( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram604( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram605( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram606( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram609( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram610( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram611( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram612( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram613( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram614( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram615( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram616( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram617( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram618( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram619( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram620( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram621( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram622( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram623( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram624( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram625( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram626( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram627( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram628( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram629( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram630( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram631( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram632( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram633( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram634( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram635( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram636( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram637( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram638( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram639( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram640( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram641( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram642( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram643( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram644( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram645( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram646( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram647( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram648( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram649( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram650( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram651( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram652( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram653( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram654( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram655( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram656( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram657( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram658( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram659( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram660( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram661( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram662( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram663( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram664( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram665( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram666( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram667( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram668( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram669( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram670( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram671( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram672( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram673( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram674( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram675( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram676( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram677( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram678( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram679( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram680( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram681( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram682( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram683( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram684( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram685( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram686( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram687( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram688( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram689( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram690( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram691( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram692( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram693( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram694( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram695( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram696( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram697( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram698( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram699( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram700( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram701( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram702( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram703( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram704( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram705( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram706( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram707( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram708( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram709( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram710( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram711( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram712( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram713( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram714( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram715( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram716( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram717( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram718( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram719( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram720( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram721( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram722( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram723( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram724( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram725( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram726( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram727( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram728( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram729( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram730( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram731( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram732( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram733( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram734( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram735( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram736( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram737( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram738( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram739( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram740( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram741( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram742( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram743( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram744( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram745( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram746( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram747( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram748( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram749( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram750( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram751( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram752( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram753( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram754( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram755( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram756( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram757( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram758( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram759( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram760( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram761( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram762( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram763( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram764( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram765( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram766( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram767( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram768( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram769( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram770( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram771( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram772( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram773( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram774( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram775( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram776( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram777( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram778( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram779( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram780( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram781( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram782( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram783( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram784( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram785( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram786( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram787( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram788( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram789( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram790( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram791( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram792( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram793( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram794( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram795( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram796( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram797( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram798( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram799( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram800( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram801( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram802( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram803( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram804( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram805( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram806( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram807( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram808( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram809( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram810( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram811( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram812( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram813( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram814( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram815( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram816( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram817( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram818( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram819( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram820( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram821( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram822( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram823( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram824( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram825( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram826( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram827( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram828( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram829( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram830( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram831( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram832( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram833( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram834( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram835( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram836( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram837( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram838( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram839( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram840( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram841( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram842( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram843( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram844( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram845( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram846( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram847( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram848( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram849( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram850( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram851( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram852( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram853( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram854( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram855( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram856( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram857( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram858( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram859( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram860( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram861( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram862( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram863( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram864( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram865( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram866( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram867( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram868( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram869( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram870( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram871( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram872( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram873( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram874( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram875( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram876( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram877( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram878( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram879( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram880( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram881( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram882( wfs, jamps, channelIds, COUPs, numerators, denominators
); + diagram883( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram884( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram885( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram886( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram887( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram888( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram889( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram890( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram891( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram892( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram893( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram894( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram895( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram896( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram897( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram898( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram899( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram900( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram901( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram902( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram903( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram904( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram905( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram906( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram907( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram908( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram909( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram910( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram911( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram912( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram913( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram914( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram915( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram916( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram917( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram918( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram919( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram920( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram921( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram922( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram923( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram924( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram925( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram926( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram927( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram928( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram929( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram930( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram931( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram932( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram933( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram934( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram935( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram936( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram937( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram938( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram939( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram940( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram941( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram942( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram943( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram944( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram945( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram946( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram947( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram948( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram949( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram950( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram951( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram952( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram953( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram954( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram955( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram956( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram957( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram958( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram959( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram960( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram961( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram962( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram963( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram964( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram965( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram966( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram967( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram968( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram969( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram970( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram971( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram972( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram973( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram974( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram975( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram976( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram977( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram978( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram979( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram980( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram981( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram982( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram983( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram984( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram985( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram986( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram987( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram988( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram989( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram990( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram991( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram992( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram993( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram994( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram995( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram996( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram997( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram998( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram999( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1000( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1001( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1002( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1003( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1004( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1005( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1006( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1007( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1008( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1009( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1010( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1011( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1012( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1013( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1014( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1015( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1016( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1017( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1018( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1019( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1020( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1021( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1022( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1023( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1024( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1025( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1026( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1027( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1028( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1029( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1030( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1031( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1032( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1033( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1034( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1035( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1036( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1037( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1038( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1039( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1040( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1041( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1042( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1043( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1044( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1045( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1046( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1047( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1048( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1049( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1050( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1051( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1052( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1053( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1054( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1055( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1056( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1057( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1058( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1059( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1060( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1061( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1062( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1063( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1064( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1065( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1066( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1067( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1068( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1069( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1070( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1071( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1072( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1073( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1074( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1075( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1076( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1077( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1078( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1079( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1080( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1081( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1082( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1083( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1084( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1085( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1086( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1087( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1088( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1089( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1092( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1093( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1094( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1095( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1096( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1097( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1098( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1099( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1123( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1124( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1125( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1126( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1127( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1128( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1129( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1130( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1131( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1132( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1133( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1134( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1135( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1136( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1137( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1138( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1139( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1140( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1141( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1142( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1143( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1144( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1145( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1146( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1147( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1148( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1149( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1150( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1151( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1152( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1153( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1154( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1155( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1156( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1157( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1158( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1159( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1160( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1161( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1162( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1163( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1164( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1165( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1166( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1167( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1168( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1169( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1170( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1171( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1172( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1173( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1174( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1175( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1176( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1177( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1178( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1179( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1180( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1181( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1182( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1183( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1184( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1185( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1186( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1187( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1188( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1189( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1190( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1191( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1192( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1193( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1194( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1195( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1196( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1197( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1198( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1199( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1200( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1201( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1202( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1203( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1204( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1205( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1206( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1207( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1208( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1209( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1210( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1211( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1212( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1213( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1214( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1215( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1216( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1217( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1218( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1219( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1220( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1221( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1222( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1223( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1224( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1225( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1226( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1227( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1228( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1229( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1231( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1232( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1233( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1234( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1235( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1236( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1237( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1238( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1239( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1240( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -30383,7 +3036,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -30419,6 +3076,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -30462,6 +3123,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -30564,26 +3229,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, 
ievt0 );
       fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
     }
 #endif
   }
 
   //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -30591,25 +3256,40 @@ namespace mg5amcCpu
                        fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
                        fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel )           // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps,           // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,             // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
+                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s )     // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+      atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
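The update_jamp2s kernel above accumulates the squared colour amplitudes |jamp|^2 of a single helicity into the colAllJamp2s running sum. Because each good helicity now runs in its own CUDA/HIP stream, two streams can update the same event slot concurrently, so the addition has to be atomic. A minimal self-contained sketch of the same accumulation pattern, using plain arrays and hypothetical names (accumulateJamp2, jamps, jamp2) instead of the plugin's DeviceAccessJamp/DeviceAccessJamp2 accessors, and assuming fptype is double:

  #include <cuda_runtime.h>

  // Each per-helicity stream launches this kernel once, folding |jamp|^2 for its
  // helicity into jamp2[icol][ievt]; jamps stores (re,im) pairs per colour/event.
  // NB: atomicAdd on double requires compute capability sm_60 or newer.
  __global__ void accumulateJamp2( const double* jamps, // input: [ncolor][nevt][2] for one helicity
                                   double* jamp2,       // in/out: [ncolor][nevt], shared across streams
                                   int ncolor,
                                   int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    if( ievt >= nevt ) return;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const double re = jamps[( icol * nevt + ievt ) * 2];
      const double im = jamps[( icol * nevt + ievt ) * 2 + 1];
      atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // streams for different helicities may hit the same slot
    }
  }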
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  select_col( int* allselcol,                    // output: color selection[nevt]
+              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+    // SCALAR channelId for the current event (CUDA)
+    unsigned int channelId = gpu_channelId( allChannelIds );
+    // Event-by-event random choice of color #402
+    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      if( channelId > mgOnGpu::nchannels )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+      }
+      // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+      fptype_sv jamp2_sv[ncolor] = { 0 };
+      assert( allJamp2s != nullptr ); // sanity check
+      using J2_ACCESS = DeviceAccessJamp2;
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+        jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+      if( iconfig <= 0 )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+        assert( iconfig > 0 ); // SANITY CHECK #917
+      }
+      else if( iconfig > (int)mgOnGpu::nconfigSDE )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+      }
+      fptype targetamp[ncolor] = { 0 };
+      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( icolC == 0 )
+          targetamp[icolC] = 0;
+        else
+          targetamp[icolC] = targetamp[icolC - 1];
+        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+      }
+      //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+        {
+          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+          //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+          break;
+        }
+      }
+    }
+    else
+    {
+      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+    }
+    return;
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
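The select_col kernel above and the helicity choice in add_and_select_hel share one sampling rule: form the running sum of non-negative weights (the jamp2 of each allowed colour, or the ME of each good helicity) and return the first index whose cumulative fraction exceeds a uniform random number in [0,1). A host-side sketch of just that rule, with hypothetical names and none of the SDE channel bookkeeping:

  #include <cassert>
  #include <vector>

  // Return the first index whose cumulative weight fraction exceeds r in [0,1);
  // this is the same test applied to allrndhel and allrndcol in the kernels above.
  int selectIndex( const std::vector<double>& weights, double r )
  {
    assert( !weights.empty() );
    double total = 0;
    for( double w : weights ) total += w;
    double running = 0;
    for( size_t i = 0; i < weights.size(); i++ )
    {
      running += weights[i];
      if( r < running / total ) return (int)i;
    }
    return (int)weights.size() - 1; // guard against rounding when r is very close to 1
  }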
   // Evaluate |M|^2, part independent of incoming flavour
-  __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  void
   sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
             const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,           // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel,                    // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,                    // output: color selection[nevt]
+            fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+            fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllJamps,              // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllWfs,                // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype2* ghelAllBlasTmp,           // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            gpuBlasHandle_t* ghelBlasHandles,  // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            gpuStream_t* ghelStreams,          // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const int gpublocks,               // input: cuda gpublocks
+            const int gputhreads )             // input: cuda gputhreads
+#else
+  void
+  sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+            const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+            const fptype* allrndhel,           // input: random numbers[nevt] for helicity selection
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
 #endif
+            fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
-            int* allselcol                     // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
-            , const int nevt                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,                    // output: color selection[nevt]
+            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-            ) /* clang-format on */
+            const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+            )
+#endif /* clang-format on */
   {
     mgDebugInitialise();
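Each ghel* argument in the GPU signature above is a "super-buffer": the per-event buffers of all good helicities concatenated, so that slice ighel is recovered with a fixed stride (this is what the hAll* pointers in PART 1a below compute). A sketch of the slicing arithmetic, assuming fptype is double and mgOnGpu::nx2 == 2 for the real and imaginary parts:

  #include <cstddef>

  // Offsets into the per-helicity super-buffers; the strides mirror the hAllJamps/hAllWfs
  // pointer arithmetic in sigmaKin (names here are illustrative, not the plugin's API).
  inline double* jampSlice( double* ghelAllJamps, int ighel, int nevt, int ncolor )
  {
    return ghelAllJamps + (size_t)ighel * nevt * ncolor * 2; // layout [ighel][ncolor*2*nevt]
  }
  inline double* wfSlice( double* ghelAllWfs, int ighel, int nevt, int nwf, int nw6 )
  {
    return ghelAllWfs + (size_t)ighel * nwf * nevt * nw6 * 2; // layout [ighel][nwf*nw6*2*nevt]
  }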
@@ -30747,20 +3616,14 @@ namespace mg5amcCpu
       // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
       constexpr int nprocesses = 1;
       static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
-      constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+      constexpr int process_id = 1; // code generation source: standalone_cudacpp
       static_assert( process_id == 1, "Assume process_id == 1" );
     }
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef MGONGPUCPP_GPUIMPL
-    // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
     //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
     using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -30772,17 +3635,20 @@ namespace mg5amcCpu
 #endif
 
     // Start sigmaKin_lines
-#include "GpuAbstraction.h"
 
-    // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
+    // === PART 0 - INITIALISATION (before calculate_jamps) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
 #ifdef MGONGPUCPP_GPUIMPL
-    allMEs[ievt] = 0;
+    const int nevt = gpublocks * gputhreads;
+    gpuMemset( allMEs, 0, nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    allNumerators[ievt] = 0;
-    allDenominators[ievt] = 0;
+    gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
+    gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #else
     const int npagV = nevt / neppV;
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
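PART 1a below fans the helicity loop out over CUDA/HIP streams: each good helicity is assigned its own stream, kernels within one stream stay ordered, different streams may overlap, and a single device-wide synchronisation closes the fan-out before the helicity and colour choices are made. A reduced sketch of that scheduling shape (placeholder kernel and buffer names, not the actual calculate_jamps API):

  #include <cuda_runtime.h>

  __global__ void workForOneHelicity( double* out, int nevt ) { /* per-helicity computation */ }

  // Launch one kernel per good helicity on its own stream, then wait for all of them:
  // the same fan-out/fan-in shape used by sigmaKin below.
  void launchPerHelicity( double* buffers[], cudaStream_t streams[], int nGoodHel, int blocks, int threads, int nevt )
  {
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      workForOneHelicity<<<blocks, threads, 0, streams[ighel]>>>( buffers[ighel], nevt );
    cudaDeviceSynchronize(); // do not start helicity/colour selection until all streams have completed
  }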
@@ -30808,93 +3674,63 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
+    // In multichannel mode, also compute the running sums over helicities of numerators and denominators
     for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
       const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
 #else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
 #endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
     }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream
+    for( int
ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30936,7 +3772,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -30959,7 +3795,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -30968,25 +3804,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -30996,8 +3838,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -31013,11 +3857,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -31119,14 +3964,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2eb1e066ff..6b99d481e4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 7; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1890; //static const int ncomb = 128; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: 
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 523ef1948b..5058fe1f57 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 3152176aa0..37f74434ee 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..c027c38503 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -0,0 +1,501 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
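+
+// NB: this new file factors the colour algebra out of the generated CPPProcess.cc: it holds the
+// process-specific colour tables (colorDenom and colorMatrix below), which are contracted against the
+// colour amplitudes "jamp" computed by calculate_jamps - via color_sum_cpu on the C++ side, and via the
+// cuBLAS/hipBLAS handles and BLAS buffers that sigmaKin receives on the GPU side.
+// A minimal sketch of that contraction, assuming the usual MG5aMC colour-sum convention (names are
+// illustrative only, not the actual kernel signatures):
+//   for( int icol = 0; icol < ncolor; icol++ )
+//   {
+//     cxtype ztemp = cxmake( 0, 0 );
+//     for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += colorMatrix[icol][jcol] * jamp[jcol];
+//     deltaME += cxreal( ztemp * cxconj( jamp[icol] ) ) / colorDenom[icol]; // |M|^2 += Re( J^+ C J ) / denom
+//   }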
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 },
+    { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 },
+    { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 },
+    { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
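+  // [Illustrative sketch, not generated code] The effect of NormalizedColorMatrix can be seen on a
+  // hypothetical 2-color example: with colorMatrix = { { 16, -2 }, { -2, 16 } } and colorDenom = { 3, 3 },
+  // the constexpr constructor fills value = { 16/3., -2/3., -2/3., 16/3. } entirely at compile time,
+  // i.e. row icol of colorMatrix divided by colorDenom[icol]; the only run-time cost is the one-off
+  // copy to device memory in createNormalizedColorMatrix() above:
+  //   constexpr NormalizedColorMatrix<double> nm2; // hypothetical instantiation with ncolor == 2
+  //   static_assert( nm2.value[1] == -2. / 3. );   // off-diagonal entry of row 0, divided by colorDenom[0]
+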
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary cross terms cancel, as AMB = BMA for a symmetric M).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not quite a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
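+  // [Illustrative sketch, not generated code] A scalar, non-SIMD illustration of the triangular trick
+  // used in color_sum_cpu above, with hypothetical names. For a real symmetric M, the quadratic form
+  // sum_{i,j} J_i M_ij J_j equals sum_i M_ii J_i^2 + sum_{i<j} (2 M_ij) J_i J_j, so only the upper
+  // triangle needs to be visited once the "2*" and "/denom" factors are folded into cf2:
+  //   double quadForm( const double* J, int n ) // J = the real (or imaginary) parts of the jamps
+  //   {
+  //     double me = 0;
+  //     for( int i = 0; i < n; i++ )
+  //     {
+  //       double z = cf2.value[i][i] * J[i];  // diagonal entry, already divided by colorDenom[i]
+  //       for( int j = i + 1; j < n; j++ )
+  //         z += cf2.value[i][j] * J[j];      // off-diagonal entries already hold "2*M/denom"
+  //       me += J[i] * z;
+  //     }
+  //     return me;
+  //   }
+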
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
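+  // [Editorial note] The "new1" striding used below (and by the cuBLAS path) stores, for one helicity,
+  // all real parts contiguously over events, then all imaginary parts; the index arithmetic can be
+  // summarized by a small helper (hypothetical, shown only to document the layout):
+  //   inline int jampIndex( int ix2, int icol, int ievt, int nevt ) // ix2: 0 = real, 1 = imaginary
+  //   {
+  //     return ix2 * ncolor * nevt + icol * nevt + ievt; // ievt runs fastest, giving coalesced access
+  //   }
+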
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding is acceptable here, as long as all ncolor*2*nevt elements are correctly copied!
+    // NB! However, in case this helps performance, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
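+  // [Illustrative sketch, not generated code] color_sum_blas below maps the colour sum onto two BLAS
+  // calls per component (real and imaginary). In plain loops, with the "new1" layout jamp[icol * nevt + ievt]
+  // and a column-major ztemp[ievt * ncolor + icol], the equivalent reference computation for one
+  // component would be (hypothetical names):
+  //   // Step 1 (gemm): ztemp(:,ievt) = NormColMat * jamp(:,ievt) for every event
+  //   for( int ievt = 0; ievt < nevt; ievt++ )
+  //     for( int icol = 0; icol < ncolor; icol++ )
+  //     {
+  //       ztemp[ievt * ncolor + icol] = 0;
+  //       for( int jcol = 0; jcol < ncolor; jcol++ )
+  //         ztemp[ievt * ncolor + icol] += normColMat[icol * ncolor + jcol] * jamp[jcol * nevt + ievt];
+  //     }
+  //   // Step 2 (strided-batched gemm of 1x1 results): ME[ievt] += jamp(:,ievt) dot ztemp(:,ievt)
+  //   for( int ievt = 0; ievt < nevt; ievt++ )
+  //     for( int icol = 0; icol < ncolor; icol++ )
+  //       mes[ievt] += jamp[icol * nevt + ievt] * ztemp[ievt * ncolor + icol];
+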
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note the new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use the gemmStridedBatched wrapper (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
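+  // [Editorial note] color_sum_gpu below is the per-helicity entry point: it dispatches at run time
+  // between the hand-written color_sum_kernel and the BLAS path above. A hypothetical caller (the
+  // names are illustrative, not the actual driver code) might look like:
+  //   gpuBlasHandle_t* pBlasHandle = useBlasColorSum ? &blasHandle : nullptr; // e.g. driven by CUDACPP_RUNTIME_BLASCOLORSUM
+  //   color_sum_gpu( allMEs, allJamps, pBlasHandle ? allBlasTmp : nullptr, stream, pBlasHandle, gpublocks, gputhreads );
+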
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h
new file mode 100644
index 0000000000..1ff425b7f7
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h
@@ -0,0 +1,49496 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
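+
+// [Editorial overview, inferred from the generated code itself rather than from its documentation]
+// This header defines one __global__ kernel per Feynman diagram of the P1_gg_ttxggg subprocess
+// (diagram1 ... diagram1240). Each kernel computes the wavefunctions and amplitude(s) of its own
+// diagram and adds amp_sv[0], with the colour-flow sign of each contribution, into the jamps buffer;
+// the colour sum over the accumulated jamps (see color_sum.h) then yields |M|^2. A plausible launch
+// sequence per helicity, with hypothetical driver variable names, would be:
+//   diagram1<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//   diagram2<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
+//   // ... all remaining diagrams, which reuse the wavefunctions already stored in wfs ...
+//   color_sum_gpu( allMEs, jamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );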
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 1240 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] );
+    VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1
+    VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
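+  // [Editorial note, an inference from usage rather than a statement of the header's exact contents]
+  // "diagram_boilerplate.h", included at the top of every diagramN kernel, appears to bring into scope
+  // the per-kernel locals used below without further declaration: the wavefunction views w_fp[] over
+  // wfs, the amplitude accessors amp_fp/amp_sv, the jamp accessor J_ACCESS and, when
+  // MGONGPU_SUPPORTS_MULTICHANNEL is defined, channelId, numerators_sv and denominators_sv.
+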
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 1240 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 1240 *** + // Wavefunction(s) for diagram number 3 + // (none) + // Amplitude(s) for diagram number 3 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A 
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 1240 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 1240 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 1240 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 1240 ***
+    // Wavefunction(s) for diagram number 7
+    VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 7
+    VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
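+  // The per-diagram kernels share one uniform signature so that they can be dispatched through a single table;
+  // "diagram_boilerplate.h" presumably expands to the common prologue (deriving the per-event channelId, the
+  // w_fp/amp_sv accessors and the numerators_sv/denominators_sv references from the kernel arguments), but its
+  // exact contents are not shown in this diff.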
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 1240 ***
+    // Wavefunction(s) for diagram number 8
+    // (none)
+    // Amplitude(s) for diagram number 8
+    VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 1240 ***
+    // Wavefunction(s) for diagram number 9
+    // (none)
+    // Amplitude(s) for diagram number 9
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 1240 ***
+    // Wavefunction(s) for diagram number 10
+    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
+    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] );
+    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 10
+    VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
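+  // VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 above build three internal gluon wavefunctions (w_fp[15..17]), one per
+  // colour structure of the four-gluon vertex, which the three subsequent VVV1_0 contractions consume separately.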
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 1240 ***
+    // Wavefunction(s) for diagram number 11
+    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] );
+    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 11
+    VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 1240 ***
+    // Wavefunction(s) for diagram number 12
+    VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 12
+    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 13 OF 1240 ***
+    // Wavefunction(s) for diagram number 13
+    VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] );
+    // Amplitude(s) for diagram number 13
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 14 OF 1240 ***
+    // Wavefunction(s) for diagram number 14
+    VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 14
+    VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
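+  // Internal wavefunctions are cached across kernels: w_fp[24] computed in diagram 13 and w_fp[25] computed in
+  // diagram 14 are reused by later diagrams, which implies the diagramXXX kernels must run in diagram order.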
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 15 OF 1240 ***
+    // Wavefunction(s) for diagram number 15
+    VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
+    // Amplitude(s) for diagram number 15
+    VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 16 OF 1240 ***
+    // Wavefunction(s) for diagram number 16
+    // (none)
+    // Amplitude(s) for diagram number 16
+    VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 17 OF 1240 ***
+    // Wavefunction(s) for diagram number 17
+    VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
+    // Amplitude(s) for diagram number 17
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 18 OF 1240 ***
+    // Wavefunction(s) for diagram number 18
+    // (none)
+    // Amplitude(s) for diagram number 18
+    VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 19 OF 1240 ***
+    // Wavefunction(s) for diagram number 19
+    VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
+    // Amplitude(s) for diagram number 19
+    VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 1240 ***
+    // Wavefunction(s) for diagram number 20
+    // (none)
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 1240 ***
+    // Wavefunction(s) for diagram number 21
+    VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    // Amplitude(s) for diagram number 21
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 1240 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 1240 ***
+    // Wavefunction(s) for diagram number 23
+    // (none)
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 1240 ***
+    // Wavefunction(s) for diagram number 24
+    VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 24
+    VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
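+  // Note that diagram 24 recycles the scratch slot w_fp[25] (overwriting the value written by diagram 14),
+  // another indication that the per-diagram kernels are meant to execute strictly in sequence.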
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 1240 ***
+    // Wavefunction(s) for diagram number 25
+    VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
+    VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
+    VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
+    // Amplitude(s) for diagram number 25
+    VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 1240 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); + FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 1240 *** + // Wavefunction(s) for diagram number 27 + FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], 
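//
// (Sketch, not part of this patch.) The +-amp_sv[0] and +-cxtype( 0, 1 ) * amp_sv[0]
// updates above implement the color decomposition: each Feynman amplitude feeds a subset
// of the ncolor color-flow amplitudes ("jamps") with a coefficient of +-1 or +-i, so
// J_ACCESS::kernelAccessIcol( jamps, icol ) must return a writable complex view of color
// flow icol for the current event. Assuming the layout implied by the declared size
// jamps[ncolor*2*nevtORneppV] (separate real and imaginary planes per color), a scalar
// C++ equivalent of one update could be (names hypothetical):
//
//   inline void addToJamp( fptype* jamps, int icol, int ievt, int nevt, const cxtype& amp )
//   {
//     jamps[( icol * 2 + 0 ) * nevt + ievt] += cxreal( amp ); // real plane
//     jamps[( icol * 2 + 1 ) * nevt + ievt] += cximag( amp ); // imaginary plane
//   }
//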
+  __global__ void
+  diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 1240 ***
+    // Wavefunction(s) for diagram number 27
+    FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 1240 ***
+    // Wavefunction(s) for diagram number 28
+    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
+    // Amplitude(s) for diagram number 28
+    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 1240 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 1240 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 1240 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 1240 ***
+    // Wavefunction(s) for diagram number 32
+    // (none)
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 1240 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
+    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 1240 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 1240 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 1240 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
+    // Amplitude(s) for diagram number 36
+    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
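//
// (Note, not part of this patch.) The MGONGPU_SUPPORTS_MULTICHANNEL blocks accumulate the
// single-diagram-enhancement (SDE) statistics: numerators_sv adds |amp|^2 only when the
// event's channelId selects this diagram, while denominators_sv adds |amp|^2 for every
// SDE-eligible diagram (the multi-amplitude diagrams 25, 32 and 48 carry no such block).
// The assumed downstream use is a per-event channel weight applied to the matrix element:
//
//   // meSDE = me * numerators / denominators; // fraction of the SDE sum carried by the chosen channel
//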
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 1240 ***
+    // Wavefunction(s) for diagram number 37
+    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 1240 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 1240 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 1240 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 1240 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 1240 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 1240 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 1240 ***
+    // Wavefunction(s) for diagram number 44
+    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
+    // Amplitude(s) for diagram number 44
+    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
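//
// (Sketch, not part of this patch.) The two signatures selected by MGONGPUCPP_GPUIMPL
// differ in who resolves the couplings: on the GPU each thread receives the full
// dependent couplings array and the boilerplate presumably derives the per-event COUPs
// pointers from it, while the vectorized C++ build passes COUPs already positioned on
// the current event page. A plausible GPU-side derivation (names assumed):
//
//   const fptype* COUPs[nxcoup];
//   for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // dependent couplings, per event
//     COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
//   for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // independent couplings, constant memory
//     COUPs[ndcoup + iicoup] = cIPC + iicoup * 2;
//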
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 1240 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 1240 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 1240 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 1240 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 1240 ***
+    // Wavefunction(s) for diagram number 49
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
+    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 1240 ***
+    // Wavefunction(s) for diagram number 50
+    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 1240 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 1240 ***
+    // Wavefunction(s) for diagram number 52
+    // (none)
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 1240 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
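//
// (Note, not part of this patch.) The per-diagram kernels are not independent: the wfs
// scratch buffer threads wavefunctions across them. For example, w_fp[39] filled by
// diagram33 is overwritten by diagram42 and then consumed up to diagram57, so the
// kernels only compute the right amplitudes when executed in increasing diagram order on
// the same stream. A sketch of the assumed host-side launch loop (names hypothetical):
//
//   for( auto* diagramKernel : diagramKernels ) // diagram1, diagram2, ... in diagram order
//     gpuLaunchKernel( diagramKernel, gpublocks, gputhreads,
//                      wfs, jamps, channelIds, couplings, numerators, denominators );
//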
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 1240 *** + // Wavefunction(s) for diagram number 54 + // (none) + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 1240 *** + // Wavefunction(s) for diagram number 55 + // (none) + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 1240 *** + // Wavefunction(s) for diagram number 56 + // (none) + // Amplitude(s) for diagram number 56 + FFV1_0( 
w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 1240 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 58 OF 1240 *** + // Wavefunction(s) for diagram number 58 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); + FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] ); + // Amplitude(s) for diagram number 58 + FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 1240 *** + // Wavefunction(s) for diagram number 59 + FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] ); + // Amplitude(s) for diagram number 59 + FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 1240 *** + // Wavefunction(s) for diagram number 60 + FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] ); + // 
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 1240 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 1240 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 1240 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 1240 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
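[Reviewer note, not part of the patch] Diagram 64 above is typical of the multi-amplitude diagrams in this hunk: each FFV1_0 call overwrites amp_sv[0], whose value is then scattered with relative signs and factors of i into the color-flow array jamps. The J_ACCESS::kernelAccessIcol accessor is defined elsewhere in the plugin; purely as orientation, in the scalar (no-SIMD, neppV=1) case it could reduce to something like the sketch below, where the icol-th complex slot of jamps is returned by reference (the standalone typedefs and the function shape are assumptions, not the actual accessor).

#include <complex>
typedef double fptype;
typedef std::complex<double> cxtype;

// Hypothetical scalar accessor: jamps holds ncolor complex values as
// ncolor*2 consecutive fptypes; returning a reference to color flow
// icol lets "+=" and "-=" accumulate in place.
inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
{
  return reinterpret_cast<cxtype*>( jamps )[icol];
}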
+  __global__ void
+  diagram65( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 1240 ***
+    // Wavefunction(s) for diagram number 65
+    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 1240 ***
+    // Wavefunction(s) for diagram number 66
+    // (none)
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 1240 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 1240 ***
+    // Wavefunction(s) for diagram number 68
+    // (none)
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 1240 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 1240 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 1240 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
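[Reviewer note, not part of the patch] The multichannel bookkeeping in these kernels adds the squared modulus of the single-diagram amplitude to numerators_sv (only for the sampled channel) and to denominators_sv (for every non-zero channel). cxabs2 is the plugin's squared-modulus helper; as a sketch of the intended semantics in the scalar case (an assumption for illustration, not the actual implementation):

#include <complex>
typedef double fptype;
typedef std::complex<double> cxtype;

// Squared modulus |c|^2, avoiding the sqrt implied by std::abs
inline fptype cxabs2( const cxtype& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}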
+  __global__ void
+  diagram72( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 1240 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 1240 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 1240 ***
+    // Wavefunction(s) for diagram number 74
+    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
+    // Amplitude(s) for diagram number 74
+    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 1240 ***
+    // Wavefunction(s) for diagram number 75
+    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
+    // Amplitude(s) for diagram number 75
+    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 1240 ***
+    // Wavefunction(s) for diagram number 76
+    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 1240 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 1240 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 1240 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
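[Reviewer note, not part of the patch] Splitting one monolithic calculate_wavefunctions into 1240 per-diagram __global__ kernels implies that a driver must invoke them in order, since later diagrams reuse wavefunctions (w_fp) computed by earlier ones. The driver is outside this hunk; a hypothetical sketch of the GPU-side call sequence is given below (the launch configuration names gpublocks/gputhreads and the final color-sum step are assumptions).

// Hypothetical driver fragment, illustration only
#ifdef MGONGPUCPP_GPUIMPL
  diagram78<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram79<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram80<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  // ... one launch per diagram up to diagram1240, then the color sum over jamps
#endif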
+  __global__ void
+  diagram80( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 1240 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 1240 ***
+    // Wavefunction(s) for diagram number 81
+    // (none)
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 1240 ***
+    // Wavefunction(s) for diagram number 82
+    // (none)
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 1240 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 1240 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 1240 ***
+    // Wavefunction(s) for diagram number 85
+    FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 1240 ***
+    // Wavefunction(s) for diagram number 86
+    FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 1240 ***
+    // Wavefunction(s) for diagram number 87
+    // (none)
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram88( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 88 OF 1240 ***
+    // Wavefunction(s) for diagram number 88
+    // (none)
+    // Amplitude(s) for diagram number 88
+    VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram89( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 89 OF 1240 ***
+    // Wavefunction(s) for diagram number 89
+    // (none)
+    // Amplitude(s) for diagram number 89
+    FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 1240 ***
+    // Wavefunction(s) for diagram number 90
+    // (none)
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 91 OF 1240 *** + // Wavefunction(s) for diagram number 91 + // (none) + // Amplitude(s) for diagram number 91 + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including 
channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 92 OF 1240 *** + // Wavefunction(s) for diagram number 92 + // (none) + // Amplitude(s) for diagram number 92 + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 93 OF 1240 *** + // Wavefunction(s) for diagram number 93 + // (none) + // Amplitude(s) for diagram number 93 + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
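Editorial note: every diagramXXX kernel above carries the same #ifdef MGONGPUCPP_GPUIMPL split in its signature. On GPU, each thread locates its own dependent couplings inside one couplings[nevt*ndcoup*2] array for all events; in the vectorized C++ path, COUPs[nxcoup] is instead an array of pointers to the coupling pages of the current SIMD event page. A minimal sketch of the event-major indexing on the GPU side is given below; the helper name and the simple layout are assumptions for illustration only (the real cudacpp memory layout is defined elsewhere in the plugin).

  // Hypothetical helper, for illustration only (not part of this diff):
  // locate the (real, imag) pair of dependent coupling icoup for event ievt
  // in a couplings[nevt*ndcoup*2] array with a simple event-major layout.
  inline const double* coupOfEvent( const double* couplings, int ievt, int icoup, int ndcoup )
  {
    return &couplings[( ievt * ndcoup + icoup ) * 2]; // [0]=real, [1]=imag
  }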
+  __global__ void
+  diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 1240 ***
+    // Wavefunction(s) for diagram number 94
+    FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] );
+    // Amplitude(s) for diagram number 94
+    FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 1240 ***
+    // Wavefunction(s) for diagram number 95
+    FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+    // Amplitude(s) for diagram number 95
+    FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 1240 ***
+    // Wavefunction(s) for diagram number 96
+    FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 96
+    VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 1240 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 1240 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 1240 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
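Editorial note: diagram_boilerplate.h itself is not part of this hunk, but every kernel includes it right after the comment about the uniform interface. In the multichannel build it presumably derives the scalar channelId and binds the numerators_sv/denominators_sv accessors used below; in the non-multichannel build the comment says it only sanity-checks the unused arguments. A self-contained sketch of that non-multichannel branch (the function name is hypothetical, for illustration only):

  #include <cassert>
  // Sketch of the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL branch: the uniform
  // kernel interface is kept, but the three extra arguments must not be used,
  // so the boilerplate asserts that they are all nullptr.
  inline void checkNoMultiChannel( const unsigned int* channelIds,
                                   const double* numerators,
                                   const double* denominators )
  {
    assert( channelIds == nullptr );
    assert( numerators == nullptr );
    assert( denominators == nullptr );
  }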
+  __global__ void
+  diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 1240 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 1240 ***
+    // Wavefunction(s) for diagram number 101
+    // (none)
+    // Amplitude(s) for diagram number 101
+    FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 1240 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 103 OF 1240 ***
+    // Wavefunction(s) for diagram number 103
+    // (none)
+    // Amplitude(s) for diagram number 103
+    FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 1240 ***
+    // Wavefunction(s) for diagram number 104
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] );
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 1240 ***
+    // Wavefunction(s) for diagram number 105
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] );
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 1240 ***
+    // Wavefunction(s) for diagram number 106
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 1240 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 1240 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
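Editorial note: the long runs of J_ACCESS::kernelAccessIcol updates above scatter each amplitude into several color flows of jamps, with relative signs and factors of i dictated by the color decomposition. In scalar form the bookkeeping reduces to the sketch below, which uses plain std::complex instead of the vectorized cxtype accessors (an illustrative assumption, not the plugin's actual access machinery):

  #include <complex>
  using cx = std::complex<double>;
  // Add one amplitude into the jamp of color flow icol with coefficient coeff,
  // where coeff is +1, -1, +i or -i depending on the color matrix entry;
  // e.g. the diagram 107 pattern gives jamp[64] -i*amp, jamp[65] +i*amp, etc.
  inline void addAmpToColorFlow( cx* jamp, int icol, const cx& coeff, const cx& amp )
  {
    jamp[icol] += coeff * amp;
  }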
+  __global__ void
+  diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 1240 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 110 OF 1240 ***
+    // Wavefunction(s) for diagram number 110
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 110
+    FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 111 OF 1240 ***
+    // Wavefunction(s) for diagram number 111
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 111
+    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 112 OF 1240 ***
+    // Wavefunction(s) for diagram number 112
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
+    // Amplitude(s) for diagram number 112
+    FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 113 OF 1240 ***
+    // Wavefunction(s) for diagram number 113
+    // (none)
+    // Amplitude(s) for diagram number 113
+    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 114 OF 1240 ***
+    // Wavefunction(s) for diagram number 114
+    // (none)
+    // Amplitude(s) for diagram number 114
+    FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 115 OF 1240 ***
+    // Wavefunction(s) for diagram number 115
+    // (none)
+    // Amplitude(s) for diagram number 115
+    FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
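Editorial note: the two-line #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block repeated in (almost) every kernel implements single-diagram-enhancement bookkeeping: |amp|^2 is added to the numerator only when this diagram is the selected channel, and to the denominator for every contributing channel, with channelId == 0 disabling SDE altogether. A scalar sketch of the same logic (the helper names are hypothetical; cxabs2 is assumed to mean |z|^2, consistent with its use above):

  #include <complex>
  inline double cxabs2( const std::complex<double>& z ) { return std::norm( z ); } // |z|^2
  // Per-amplitude SDE update: num gets |amp|^2 for the selected channel only,
  // den gets |amp|^2 for every single-diagram channel; channelId == 0 disables SDE.
  inline void addToMultiChannel( unsigned int channelId, unsigned int thisDiagram,
                                 const std::complex<double>& amp, double& num, double& den )
  {
    if( channelId == thisDiagram ) num += cxabs2( amp );
    if( channelId != 0 ) den += cxabs2( amp );
  }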
+  __global__ void
+  diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 116 OF 1240 ***
+    // Wavefunction(s) for diagram number 116
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 116
+    FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 117 OF 1240 ***
+    // Wavefunction(s) for diagram number 117
+    VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] );
+    // Amplitude(s) for diagram number 117
+    FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 118 OF 1240 ***
+    // Wavefunction(s) for diagram number 118
+    FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] );
+    // Amplitude(s) for diagram number 118
+    FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 119 OF 1240 ***
+    // Wavefunction(s) for diagram number 119
+    // (none)
+    // Amplitude(s) for diagram number 119
+    FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 120 OF 1240 ***
+    // Wavefunction(s) for diagram number 120
+    // (none)
+    // Amplitude(s) for diagram number 120
+    FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 121 OF 1240 ***
+    // Wavefunction(s) for diagram number 121
+    // (none)
+    // Amplitude(s) for diagram number 121
+    FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 122 OF 1240 ***
+    // Wavefunction(s) for diagram number 122
+    // (none)
+    // Amplitude(s) for diagram number 122
+    FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 123 OF 1240 ***
+    // Wavefunction(s) for diagram number 123
+    // (none)
+    // Amplitude(s) for diagram number 123
+    FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all
events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 124 OF 1240 *** + // Wavefunction(s) for diagram number 124 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 124 + FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 125 OF 1240 *** + // Wavefunction(s) for diagram number 125 + FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 125 + FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 126 OF 1240 *** + // Wavefunction(s) for diagram number 126 + FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); + FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); + // Amplitude(s) for diagram number 126 + FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 127 OF 1240 *** + // Wavefunction(s) for diagram number 127 + // (none) + // Amplitude(s) for diagram number 127 + FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + 
// *** DIAGRAM 128 OF 1240 *** + // Wavefunction(s) for diagram number 128 + FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); + // Amplitude(s) for diagram number 128 + FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 129 OF 1240 *** + // Wavefunction(s) for diagram number 129 + // (none) + // Amplitude(s) for diagram number 129 + FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 130 OF 1240 *** + // Wavefunction(s) for diagram number 130 + FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); + // Amplitude(s) for diagram number 130 + VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + 
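+  // Note: a minimal sketch of what "diagram_boilerplate.h" could define, inferred only from its
+  // usage in the diagramXXX kernels (the names and accessors below are assumptions, not the real header):
+  //   fptype amp_fp[2];                 // scratch buffer for the current amplitude
+  //   cxtype_sv* amp_sv = ...;          // complex (vector) view of amp_fp
+  //   unsigned int channelId = ...;     // from channelIds: per-event on GPU, scalar channelIds[0] in C++
+  //   fptype_sv& numerators_sv = ...;   // multichannel numerator slot for this event (page)
+  //   fptype_sv& denominators_sv = ...; // multichannel denominator slot for this event (page)
+  // and, #ifndef MGONGPU_SUPPORTS_MULTICHANNEL, an assert that channelIds, numerators and
+  // denominators are all nullptr, as the comment repeated in each kernel states.
+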
+  __global__ void
+  diagram131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 131 OF 1240 ***
+    // Wavefunction(s) for diagram number 131
+    FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    // Amplitude(s) for diagram number 131
+    FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 132 OF 1240 ***
+    // Wavefunction(s) for diagram number 132
+    // (none)
+    // Amplitude(s) for diagram number 132
+    FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 133 OF 1240 ***
+    // Wavefunction(s) for diagram number 133
+    // (none)
+    // Amplitude(s) for diagram number 133
+    VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 134 OF 1240 ***
+    // Wavefunction(s) for diagram number 134
+    FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
+    // Amplitude(s) for diagram number 134
+    FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 135 OF 1240 ***
+    // Wavefunction(s) for diagram number 135
+    // (none)
+    // Amplitude(s) for diagram number 135
+    FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 136 OF 1240 ***
+    // Wavefunction(s) for diagram number 136
+    // (none)
+    // Amplitude(s) for diagram number 136
+    VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 137 OF 1240 ***
+    // Wavefunction(s) for diagram number 137
+    // (none)
+    // Amplitude(s) for diagram number 137
+    FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 138 OF 1240 ***
+    // Wavefunction(s) for diagram number 138
+    FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
+    // Amplitude(s) for diagram number 138
+    FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 139 OF 1240 ***
+    // Wavefunction(s) for diagram number 139
+    // (none)
+    // Amplitude(s) for diagram number 139
+    FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 140 OF 1240 ***
+    // Wavefunction(s) for diagram number 140
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] );
+    FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] );
+    VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 140
+    VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 141 OF 1240 ***
+    // Wavefunction(s) for diagram number 141
+    VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] );
+    // Amplitude(s) for diagram number 141
+    VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 142 OF 1240 ***
+    // Wavefunction(s) for diagram number 142
+    // (none)
+    // Amplitude(s) for diagram number 142
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 143 OF 1240 ***
+    // Wavefunction(s) for diagram number 143
+    FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] );
+    // Amplitude(s) for diagram number 143
+    FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
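+  // Note: the recurring multichannel pattern in these kernels,
+  //   if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );
+  //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+  // adds |amp|^2 of diagram N to the numerator only when N is the selected channel, and to the
+  // denominator for every contributing diagram; the single-diagram enhancement weight per event
+  // is then numerators_sv / denominators_sv (illustrative, computed downstream, not in this file).
+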
+  __global__ void
+  diagram144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 144 OF 1240 ***
+    // Wavefunction(s) for diagram number 144
+    // (none)
+    // Amplitude(s) for diagram number 144
+    FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 145 OF 1240 ***
+    // Wavefunction(s) for diagram number 145
+    // (none)
+    // Amplitude(s) for diagram number 145
+    FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 146 OF 1240 ***
+    // Wavefunction(s) for diagram number 146
+    // (none)
+    // Amplitude(s) for diagram number 146
+    FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 147 OF 1240 ***
+    // Wavefunction(s) for diagram number 147
+    FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] );
+    // Amplitude(s) for diagram number 147
+    FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 148 OF 1240 ***
+    // Wavefunction(s) for diagram number 148
+    FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 148
+    VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 149 OF 1240 ***
+    // Wavefunction(s) for diagram number 149
+    // (none)
+    // Amplitude(s) for diagram number 149
+    FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 150 OF 1240 ***
+    // Wavefunction(s) for diagram number 150
+    // (none)
+    // Amplitude(s) for diagram number 150
+    FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 151 OF 1240 ***
+    // Wavefunction(s) for diagram number 151
+    FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 151
+    VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 152 OF 1240 ***
+    // Wavefunction(s) for diagram number 152
+    // (none)
+    // Amplitude(s) for diagram number 152
+    FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 153 OF 1240 ***
+    // Wavefunction(s) for diagram number 153
+    // (none)
+    // Amplitude(s) for diagram number 153
+    FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 154 OF 1240 ***
+    // Wavefunction(s) for diagram number 154
+    // (none)
+    // Amplitude(s) for diagram number 154
+    VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 155 OF 1240 ***
+    // Wavefunction(s) for diagram number 155
+    // (none)
+    // Amplitude(s) for diagram number 155
+    FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 156 OF 1240 ***
+    // Wavefunction(s) for diagram number 156
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] );
+    VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] );
+    // Amplitude(s) for diagram number 156
+    VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 157 OF 1240 ***
+    // Wavefunction(s) for diagram number 157
+    VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
+    // Amplitude(s) for diagram number 157
+    VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) //
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 158 OF 1240 *** + // Wavefunction(s) for diagram number 158 + // (none) + // Amplitude(s) for diagram number 158 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
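Every diagramXXX body above includes "diagram_boilerplate.h", whose contents are outside this hunk. Purely for orientation, a minimal sketch of what such a header plausibly provides, inferred from the names the generated bodies rely on (w_fp, amp_fp/amp_sv, channelId, numerators_sv, denominators_sv) and from the nullptr sanity check mentioned in the comments; every detail below is an assumption, not the plugin's actual header:

    // Sketch only - assumed shape of diagram_boilerplate.h, not the real header.
    // (a w_fp array of per-wavefunction views into the wfs buffer would also be set up here)
    fptype amp_fp[2 * neppV];                                   // scratch for one complex amplitude on one event page
    cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex-vector view of the same scratch buffer
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const unsigned int channelId = ( channelIds ? channelIds[0] : 0 );           // 0 disables single-diagram enhancement
    fptype_sv& numerators_sv = *reinterpret_cast<fptype_sv*>( numerators );      // SDE numerator accumulator
    fptype_sv& denominators_sv = *reinterpret_cast<fptype_sv*>( denominators );  // SDE denominator accumulator
    #else
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr ); // uniform interface, multichannel off
    #endif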
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 159 OF 1240 ***
+    // Wavefunction(s) for diagram number 159
+    FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    // Amplitude(s) for diagram number 159
+    FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 160 OF 1240 ***
+    // Wavefunction(s) for diagram number 160
+    // (none)
+    // Amplitude(s) for diagram number 160
+    FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 161 OF 1240 ***
+    // Wavefunction(s) for diagram number 161
+    // (none)
+    // Amplitude(s) for diagram number 161
+    FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 162 OF 1240 ***
+    // Wavefunction(s) for diagram number 162
+    // (none)
+    // Amplitude(s) for diagram number 162
+    FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 163 OF 1240 ***
+    // Wavefunction(s) for diagram number 163
+    FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
+    // Amplitude(s) for diagram number 163
+    FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 164 OF 1240 ***
+    // Wavefunction(s) for diagram number 164
+    FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
+    // Amplitude(s) for diagram number 164
+    VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 165 OF 1240 ***
+    // Wavefunction(s) for diagram number 165
+    // (none)
+    // Amplitude(s) for diagram number 165
+    FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 166 OF 1240 ***
+    // Wavefunction(s) for diagram number 166
+    // (none)
+    // Amplitude(s) for diagram number 166
+    FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 167 OF 1240 ***
+    // Wavefunction(s) for diagram number 167
+    // (none)
+    // Amplitude(s) for diagram number 167
+    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 168 OF 1240 ***
+    // Wavefunction(s) for diagram number 168
+    // (none)
+    // Amplitude(s) for diagram number 168
+    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 169 OF 1240 ***
+    // Wavefunction(s) for diagram number 169
+    // (none)
+    // Amplitude(s) for diagram number 169
+    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 170 OF 1240 ***
+    // Wavefunction(s) for diagram number 170
+    // (none)
+    // Amplitude(s) for diagram number 170
+    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 171 OF 1240 ***
+    // Wavefunction(s) for diagram number 171
+    // (none)
+    // Amplitude(s) for diagram number 171
+    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 172 OF 1240 ***
+    // Wavefunction(s) for diagram number 172
+    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
+    // Amplitude(s) for diagram number 172
+    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
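The two multichannel lines repeated in each kernel implement single-diagram enhancement (SDE): |amp|^2 is added to numerators_sv only when the diagram is the selected channel, and to denominators_sv for every diagram (channelId != 0). Once all 1240 diagram kernels have run, the per-event channel weight is the ratio of the two accumulators. A sketch of that final combination, with a hypothetical helper name (sdeChannelWeight is not a function of this codebase):

    // Hypothetical post-processing sketch: w_c = |A_c|^2 / sum_d |A_d|^2
    inline fptype sdeChannelWeight( const fptype numerator, const fptype denominator )
    {
      return ( denominator != 0 ? numerator / denominator : 0 ); // guard: no diagram contributed to this event
    }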
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 173 OF 1240 ***
+    // Wavefunction(s) for diagram number 173
+    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
+    // Amplitude(s) for diagram number 173
+    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 174 OF 1240 ***
+    // Wavefunction(s) for diagram number 174
+    // (none)
+    // Amplitude(s) for diagram number 174
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 175 OF 1240 ***
+    // Wavefunction(s) for diagram number 175
+    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
+    // Amplitude(s) for diagram number 175
+    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 176 OF 1240 ***
+    // Wavefunction(s) for diagram number 176
+    // (none)
+    // Amplitude(s) for diagram number 176
+    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 177 OF 1240 ***
+    // Wavefunction(s) for diagram number 177
+    // (none)
+    // Amplitude(s) for diagram number 177
+    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 178 OF 1240 ***
+    // Wavefunction(s) for diagram number 178
+    // (none)
+    // Amplitude(s) for diagram number 178
+    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 179 OF 1240 ***
+    // Wavefunction(s) for diagram number 179
+    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 179
+    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 180 OF 1240 ***
+    // Wavefunction(s) for diagram number 180
+    // (none)
+    // Amplitude(s) for diagram number 180
+    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
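Each kernel folds its amplitude into the color-flow amplitudes jamps with a coefficient of +-1 or +-i (the cxtype( 0, 1 ) factors), addressed per color index through J_ACCESS::kernelAccessIcol. The accessor's real definition lives elsewhere in the plugin; purely as an illustration of the assumed layout (ncolor complex vectors per event page), one plausible shape is:

    // Illustration only - the plugin's real J_ACCESS may use a different (e.g. AOSOA) layout.
    struct J_ACCESS_sketch
    {
      static __host__ __device__ cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
      {
        return reinterpret_cast<cxtype_sv*>( jamps )[icol]; // slot for color-flow index icol
      }
    };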
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 181 OF 1240 ***
+    // Wavefunction(s) for diagram number 181
+    // (none)
+    // Amplitude(s) for diagram number 181
+    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 182 OF 1240 ***
+    // Wavefunction(s) for diagram number 182
+    // (none)
+    // Amplitude(s) for diagram number 182
+    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 183 OF 1240 ***
+    // Wavefunction(s) for diagram number 183
+    // (none)
+    // Amplitude(s) for diagram number 183
+    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 184 OF 1240 ***
+    // Wavefunction(s) for diagram number 184
+    // (none)
+    // Amplitude(s) for diagram number 184
+    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 185 OF 1240 ***
+    // Wavefunction(s) for diagram number 185
+    // (none)
+    // Amplitude(s) for diagram number 185
+    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 186 OF 1240 ***
+    // Wavefunction(s) for diagram number 186
+    // (none)
+    // Amplitude(s) for diagram number 186
+    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 187 OF 1240 ***
+    // Wavefunction(s) for diagram number 187
+    // (none)
+    // Amplitude(s) for diagram number 187
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 188 OF 1240 ***
+    // Wavefunction(s) for diagram number 188
+    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 188
+    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 189 OF 1240 ***
+    // Wavefunction(s) for diagram number 189
+    // (none)
+    // Amplitude(s) for diagram number 189
+    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 190 OF 1240 *** + // Wavefunction(s) for diagram number 190 + FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] ); + // Amplitude(s) for diagram number 190 + FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 191 OF 1240 *** + // Wavefunction(s) for diagram number 191 + // (none) + // Amplitude(s) for diagram number 191 + FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel 
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 192 OF 1240 *** + // Wavefunction(s) for diagram number 192 + // (none) + // Amplitude(s) for diagram number 192 + FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 193 OF 1240 *** + // Wavefunction(s) for diagram number 193 + // (none) + // Amplitude(s) for diagram number 193 + FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 194 OF 1240 *** + // Wavefunction(s) for diagram number 194 + // (none) + // Amplitude(s) for diagram number 194 + FFV1_0( w_fp[46], 
w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 195 OF 1240 *** + // Wavefunction(s) for diagram number 195 + // (none) + // Amplitude(s) for diagram number 195 + VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 196 OF 1240 *** + // Wavefunction(s) for diagram number 196 + // (none) + // Amplitude(s) for diagram number 196 + FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 197 OF 1240 *** + // Wavefunction(s) for diagram number 197 + // (none) + // Amplitude(s) for diagram number 197 + FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 198 OF 1240 *** + // Wavefunction(s) for diagram number 198 + // (none) + // Amplitude(s) for diagram number 198 + FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 199 OF 1240 *** + // Wavefunction(s) for diagram number 199 + FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); + // Amplitude(s) for diagram number 199 + FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 200 OF 1240 *** + // Wavefunction(s) for diagram number 200 + // (none) + // Amplitude(s) for diagram number 200 + FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 201 OF 1240 *** + // Wavefunction(s) for diagram number 201 + // (none) + // Amplitude(s) for diagram number 201 + FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram202( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 202 OF 1240 *** + // Wavefunction(s) for diagram number 202 + // (none) + // Amplitude(s) for diagram number 202 + FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 203 OF 1240 *** + // Wavefunction(s) for diagram number 203 + // (none) + // Amplitude(s) for diagram number 203 + FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif
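+ // (Editorial note, not part of the generated code: the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL blocks
+ // implement the bookkeeping for single-diagram-enhanced multichannel sampling: the squared amplitude
+ // of the event's selected channel is added to numerators_sv, while the squared amplitude of every
+ // contributing diagram is added to denominators_sv. Downstream, the matrix element is effectively
+ // reweighted by their ratio; schematically, for one event and with hypothetical scalar names:
+ //   fptype sdeWeight = numerator / denominator; // single-diagram-enhancement channel weight
+ // The actual combination is performed only after all diagram kernels have been executed.)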
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 204 OF 1240 *** + // Wavefunction(s) for diagram number 204 + // (none) + // Amplitude(s) for diagram number 204 + VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 205 OF 1240 *** + // Wavefunction(s) for diagram number 205 + // (none) + // Amplitude(s) for diagram number 205 + FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram206( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 206 OF 1240 *** + // Wavefunction(s) for diagram number 206 + // (none) + // Amplitude(s) for diagram number 206 + FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 207 OF 1240 *** + // Wavefunction(s) for diagram number 207 + // (none) + // Amplitude(s) for diagram number 207 + FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 208 OF 1240 *** + // Wavefunction(s) for diagram number 208 + FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 208 + FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 209 OF 1240 *** + // Wavefunction(s) for diagram number 209 + // (none) + // Amplitude(s) for diagram number 209 + FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 210 OF 1240 *** + // Wavefunction(s) for diagram number 210 + 
// (none) + // Amplitude(s) for diagram number 210 + FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 211 OF 1240 *** + // Wavefunction(s) for diagram number 211 + // (none) + // Amplitude(s) for diagram number 211 + FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 212 OF 1240 *** + // Wavefunction(s) for diagram number 212 + // (none) + // Amplitude(s) for diagram number 212 + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
diagram213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 213 OF 1240 *** + // Wavefunction(s) for diagram number 213 + // (none) + // Amplitude(s) for diagram number 213 + VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 214 OF 1240 *** + // Wavefunction(s) for diagram number 214 + // (none) + // Amplitude(s) for diagram number 214 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 215 OF 1240 *** + // Wavefunction(s) for diagram number 215 + // (none) + // Amplitude(s) for diagram number 215 + FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 216 OF 1240 *** + // Wavefunction(s) for diagram number 216 + // (none) + // Amplitude(s) for diagram number 216 + FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 217 OF 1240 *** + // Wavefunction(s) for diagram number 217 + VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] ); + // Amplitude(s) for diagram number 217 + VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 218 OF 1240 *** + // Wavefunction(s) for diagram number 218 + // (none) + // Amplitude(s) for diagram number 218 + VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
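+ // (Editorial note, not part of the generated code: every diagramXXX kernel begins by including
+ // "diagram_boilerplate.h", which is not shown in this patch. A plausible minimal sketch of what it
+ // provides, with hypothetical details that only the real header can confirm, is:
+ //   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ //   const unsigned int channelId = ...;   // per-event channelIds[ievt] on GPU, scalar channelIds[0] in C++
+ //   fptype_sv& numerators_sv = ...;       // event-page view of the numerators buffer
+ //   fptype_sv& denominators_sv = ...;     // event-page view of the denominators buffer
+ //   #else
+ //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+ //   #endif
+ //   fptype amp_fp[...];                   // local buffer for this diagram's amplitude
+ //   cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex view of amp_fp
+ // Note also that diagram219 below updates no numerators or denominators, presumably because its three
+ // VVVV amplitudes come from a four-point vertex, which is not used as a single-diagram-enhancement channel.)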
__global__ void + diagram219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 219 OF 1240 *** + // Wavefunction(s) for diagram number 219 + // (none) + // Amplitude(s) for diagram number 219 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 220 OF 1240 *** + // Wavefunction(s) for diagram number 220 + // (none) + // Amplitude(s) for diagram number 220 + FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram221( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 221 OF 1240 *** + // Wavefunction(s) for diagram number 221 + // (none) + // Amplitude(s) for diagram number 221 + FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram222( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 222 OF 1240 *** + // Wavefunction(s) for diagram number 222 + // (none) + // Amplitude(s) for diagram number 222 + FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram223( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 223 OF 1240 *** + // Wavefunction(s) for diagram number 223 + // (none) + // Amplitude(s) for diagram number 223 + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
+  __global__ void
+  diagram224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 224 OF 1240 ***
+    // Wavefunction(s) for diagram number 224
+    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 224
+    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram225( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 225 OF 1240 ***
+    // Wavefunction(s) for diagram number 225
+    // (none)
+    // Amplitude(s) for diagram number 225
+    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram226( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 226 OF 1240 ***
+    // Wavefunction(s) for diagram number 226
+    // (none)
+    // Amplitude(s) for diagram number 226
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
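Diagram 226 is the first four-gluon-vertex diagram in this hunk: the VVVV1_0 / VVVV3_0 / VVVV4_0 calls evaluate one amplitude per color structure of the vertex, and each feeds a different set of jamps entries with ±i weights. Note that these multi-amplitude vertex diagrams carry no numerators/denominators block, presumably because they are not valid single-diagram-enhancement channels. The jamps accumulated by all 1240 diagrams are color-ordered amplitudes; in the usual MG5aMC decomposition the squared matrix element is assembled afterwards (outside this hunk) as the quadratic form

  |M|^2 = \sum_{i,j=1}^{n_\mathrm{color}} J_i^{*} \, C_{ij} \, J_j

where J_i is the i-th jamps entry for the event and C_{ij} is the constant color matrix.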
+  __global__ void
+  diagram227( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 227 OF 1240 ***
+    // Wavefunction(s) for diagram number 227
+    // (none)
+    // Amplitude(s) for diagram number 227
+    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram228( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 228 OF 1240 ***
+    // Wavefunction(s) for diagram number 228
+    // (none)
+    // Amplitude(s) for diagram number 228
+    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram229( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 229 OF 1240 ***
+    // Wavefunction(s) for diagram number 229
+    // (none)
+    // Amplitude(s) for diagram number 229
+    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram230( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 230 OF 1240 ***
+    // Wavefunction(s) for diagram number 230
+    // (none)
+    // Amplitude(s) for diagram number 230
+    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram231( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 231 OF 1240 ***
+    // Wavefunction(s) for diagram number 231
+    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 231
+    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram232( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 232 OF 1240 ***
+    // Wavefunction(s) for diagram number 232
+    // (none)
+    // Amplitude(s) for diagram number 232
+    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram233( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 233 OF 1240 ***
+    // Wavefunction(s) for diagram number 233
+    // (none)
+    // Amplitude(s) for diagram number 233
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram234( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 234 OF 1240 ***
+    // Wavefunction(s) for diagram number 234
+    // (none)
+    // Amplitude(s) for diagram number 234
+    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 235 OF 1240 ***
+    // Wavefunction(s) for diagram number 235
+    // (none)
+    // Amplitude(s) for diagram number 235
+    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 236 OF 1240 ***
+    // Wavefunction(s) for diagram number 236
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
+    // Amplitude(s) for diagram number 236
+    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
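Diagram 236 computes the three four-gluon off-shell currents into w_fp[73], w_fp[79] and w_fp[80]; diagrams 237 and 238 list "Wavefunction(s): (none)" and simply reuse those slots. The kernels therefore share state through the wfs buffer and must run in diagram order, e.g. (hypothetical driver excerpt; the launch parameters and the fact that these are per-diagram launches are assumptions):

  // Sequential launches on the same CUDA stream execute in order, so the
  // shared wfs buffer is valid without explicit synchronization in between.
  diagram236<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // fills w_fp[73], w_fp[79], w_fp[80]
  diagram237<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses them in FFV1_0( w_fp[3], w_fp[57], ... )
  diagram238<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses them in FFV1_0( w_fp[41], w_fp[34], ... )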
+  __global__ void
+  diagram237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 237 OF 1240 ***
+    // Wavefunction(s) for diagram number 237
+    // (none)
+    // Amplitude(s) for diagram number 237
+    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 238 OF 1240 ***
+    // Wavefunction(s) for diagram number 238
+    // (none)
+    // Amplitude(s) for diagram number 238
+    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 239 OF 1240 ***
+    // Wavefunction(s) for diagram number 239
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
+    // Amplitude(s) for diagram number 239
+    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 240 OF 1240 ***
+    // Wavefunction(s) for diagram number 240
+    // (none)
+    // Amplitude(s) for diagram number 240
+    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram241( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 241 OF 1240 ***
+    // Wavefunction(s) for diagram number 241
+    // (none)
+    // Amplitude(s) for diagram number 241
+    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram242( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 242 OF 1240 ***
+    // Wavefunction(s) for diagram number 242
+    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
+    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
+    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
+    // Amplitude(s) for diagram number 242
+    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram243( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 243 OF 1240 ***
+    // Wavefunction(s) for diagram number 243
+    // (none)
+    // Amplitude(s) for diagram number 243
+    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram244( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 244 OF 1240 ***
+    // Wavefunction(s) for diagram number 244
+    // (none)
+    // Amplitude(s) for diagram number 244
+    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram245( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 245 OF 1240 ***
+    // Wavefunction(s) for diagram number 245
+    // (none)
+    // Amplitude(s) for diagram number 245
+    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram246( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 246 OF 1240 ***
+    // Wavefunction(s) for diagram number 246
+    // (none)
+    // Amplitude(s) for diagram number 246
+    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram247( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 247 OF 1240 ***
+    // Wavefunction(s) for diagram number 247
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+    FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 247
+    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
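Diagram 247 switches from the gluon-current topologies to fermion-line topologies, building internal fermion wavefunctions with massive propagators (mass cIPD[0], width cIPD[1], passed explicitly in the calls above). For readers new to the ALOHA naming used throughout, the trailing digit selects which leg of the Lorentz structure is taken off shell, with _0 meaning none (an amplitude); this legend summarizes the standard MG5aMC convention for reference:

  // ALOHA routine legend (standard MG5aMC convention):
  //   FFV1_0   : amplitude of the fermion-fermion-vector vertex, all inputs given
  //   FFV1_1   : off-shell fermion wavefunction for leg 1 (propagator with mass cIPD[0], width cIPD[1])
  //   FFV1_2   : off-shell fermion wavefunction for leg 2
  //   FFV1P0_3 : off-shell vector wavefunction for leg 3, with the zero mass and width ("P0") seen in diagram 253 below
  //   VVV1P0_1, VVVV1P0_1 / VVVV3P0_1 / VVVV4P0_1 : analogous off-shell gluon currents of the 3- and 4-gluon vertices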
+  __global__ void
+  diagram248( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 248 OF 1240 ***
+    // Wavefunction(s) for diagram number 248
+    FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] );
+    // Amplitude(s) for diagram number 248
+    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram249( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 249 OF 1240 ***
+    // Wavefunction(s) for diagram number 249
+    FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
+    FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] );
+    // Amplitude(s) for diagram number 249
+    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram250( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 250 OF 1240 ***
+    // Wavefunction(s) for diagram number 250
+    // (none)
+    // Amplitude(s) for diagram number 250
+    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram251( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 251 OF 1240 ***
+    // Wavefunction(s) for diagram number 251
+    FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 251
+    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
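Across diagrams 247-251 each single-amplitude diagram keeps adding |amp|^2 to numerators_sv when it is the selected channel, and to denominators_sv whenever SDE is enabled. A sketch of how such accumulators are typically consumed downstream, based only on the comments in this hunk (meHelSum_sv and the exact hook point are assumptions, not this PR's code):

  // Hypothetical post-processing of the multichannel accumulators (illustration)
  if( channelId != 0 ) // 0 disables SDE, per the channelIds comment above
  {
    // |A_channel|^2 / sum_d |A_d|^2: the fraction of the ME attributed to the selected diagram
    meHelSum_sv *= numerators_sv / denominators_sv;
  }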
+    // Wavefunction(s) for diagram number 252
+    // (none)
+    // Amplitude(s) for diagram number 252
+    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram253( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 253 OF 1240 ***
+    // Wavefunction(s) for diagram number 253
+    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] );
+    // Amplitude(s) for diagram number 253
+    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram254( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 254 OF 1240 ***
+    // Wavefunction(s) for diagram number 254
+    FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 254
+    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram255( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 255 OF 1240 ***
+    // Wavefunction(s) for diagram number 255
+    // (none)
+    // Amplitude(s) for diagram number 255
+    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram256( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 256 OF 1240 ***
+    // Wavefunction(s) for diagram number 256
+    // (none)
+    // Amplitude(s) for diagram number 256
+    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+  }
+
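Every diagram kernel in this hunk repeats the same single-diagram-enhancement (SDE) bookkeeping: when a channel is selected (channelId != 0), the squared amplitude of the selected diagram feeds the numerator and the squared amplitude of every diagram feeds the denominator, so that their ratio can later reweight the matrix element for that channel. The following is a minimal standalone sketch of this logic, assuming plain scalar doubles and hypothetical per-diagram amplitudes; it is not the plugin's vectorized amp_sv/numerators_sv machinery.

#include <complex>
#include <iostream>
#include <vector>

using fptype = double;
using cxtype = std::complex<fptype>;

// |c|^2, playing the role of cxabs2 in the generated code
inline fptype cxabs2( const cxtype& c ) { return std::norm( c ); }

int main()
{
  const unsigned int channelId = 2; // hypothetical selected channel (1-based; 0 disables SDE)
  const std::vector<cxtype> amps = { { 1.0, 2.0 }, { 0.5, -1.0 }, { -2.0, 0.0 } }; // hypothetical per-diagram amplitudes
  fptype numerator = 0;
  fptype denominator = 0;
  for( size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    if( channelId == idiag + 1 ) numerator += cxabs2( amps[idiag] ); // only the selected diagram
    if( channelId != 0 ) denominator += cxabs2( amps[idiag] );       // every diagram
  }
  std::cout << "SDE channel weight = " << numerator / denominator << std::endl;
  return 0;
}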
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram257( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 257 OF 1240 ***
+    // Wavefunction(s) for diagram number 257
+    FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
+    // Amplitude(s) for diagram number 257
+    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram258( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 258 OF 1240 ***
+    // Wavefunction(s) for diagram number 258
+    // (none)
+    // Amplitude(s) for diagram number 258
+    FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram259( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 259 OF 1240 ***
+    // Wavefunction(s) for diagram number 259
+    // (none)
+    // Amplitude(s) for diagram number 259
+    VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram260( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 260 OF 1240 ***
+    // Wavefunction(s) for diagram number 260
+    // (none)
+    // Amplitude(s) for diagram number 260
+    FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram261( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 261 OF 1240 ***
+    // Wavefunction(s) for diagram number 261
+    FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+    // Amplitude(s) for diagram number 261
+    FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram262( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 262 OF 1240 ***
+    // Wavefunction(s) for diagram number 262
+    // (none)
+    // Amplitude(s) for diagram number 262
+    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram263( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 263 OF 1240 ***
+    // Wavefunction(s) for diagram number 263
+    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );
+    // Amplitude(s) for diagram number 263
+    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram264( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 264 OF 1240 ***
+    // Wavefunction(s) for diagram number 264
+    // (none)
+    // Amplitude(s) for diagram number 264
+    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram265( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 265 OF 1240 ***
+    // Wavefunction(s) for diagram number 265
+    // (none)
+    // Amplitude(s) for diagram number 265
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram266( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 266 OF 1240 ***
+    // Wavefunction(s) for diagram number 266
+    FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );
+    // Amplitude(s) for diagram number 266
+    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram267( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 267 OF 1240 ***
+    // Wavefunction(s) for diagram number 267
+    // (none)
+    // Amplitude(s) for diagram number 267
+    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram268( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 268 OF 1240 ***
+    // Wavefunction(s) for diagram number 268
+    // (none)
+    // Amplitude(s) for diagram number 268
+    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram269( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 269 OF 1240 ***
+    // Wavefunction(s) for diagram number 269
+    // (none)
+    // Amplitude(s) for diagram number 269
+    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram270( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 270 OF 1240 ***
+    // Wavefunction(s) for diagram number 270
+    FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 270
+    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram271( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 271 OF 1240 ***
+    // Wavefunction(s) for diagram number 271
+    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 271
+    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram272( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 272 OF 1240 ***
+    // Wavefunction(s) for diagram number 272
+    // (none)
+    // Amplitude(s) for diagram number 272
+    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram273( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 273 OF 1240 ***
+    // Wavefunction(s) for diagram number 273
+    // (none)
+    // Amplitude(s) for diagram number 273
+    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram274( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 274 OF 1240 ***
+    // Wavefunction(s) for diagram number 274
+    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 274
+    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram275( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 275 OF 1240 ***
+    // Wavefunction(s) for diagram number 275
+    // (none)
+    // Amplitude(s) for diagram number 275
+    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram276( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 276 OF 1240 ***
+    // Wavefunction(s) for diagram number 276
+    // (none)
+    // Amplitude(s) for diagram number 276
+    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram277( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 277 OF 1240 ***
+    // Wavefunction(s) for diagram number 277
+    // (none)
+    // Amplitude(s) for diagram number 277
+    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram278( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 278 OF 1240 ***
+    // Wavefunction(s) for diagram number 278
+    // (none)
+    // Amplitude(s) for diagram number 278
+    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram279( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 279 OF 1240 ***
+    // Wavefunction(s) for diagram number 279
+    // (none)
+    // Amplitude(s) for diagram number 279
+    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram280( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
"diagram_boilerplate.h" + // *** DIAGRAM 280 OF 1240 *** + // Wavefunction(s) for diagram number 280 + // (none) + // Amplitude(s) for diagram number 280 + VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram281( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 281 OF 1240 *** + // Wavefunction(s) for diagram number 281 + // (none) + // Amplitude(s) for diagram number 281 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], 
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram282( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 282 OF 1240 ***
+    // Wavefunction(s) for diagram number 282
+    FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 282
+    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram283( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 283 OF 1240 ***
+    // Wavefunction(s) for diagram number 283
+    // (none)
+    // Amplitude(s) for diagram number 283
+    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram284( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 284 OF 1240 ***
+    // Wavefunction(s) for diagram number 284
+    // (none)
+    // Amplitude(s) for diagram number 284
+    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram285( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 285 OF 1240 ***
+    // Wavefunction(s) for diagram number 285
+    // (none)
+    // Amplitude(s) for diagram number 285
+    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram286( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 286 OF 1240 *** + // Wavefunction(s) for diagram number 286 + FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); + // Amplitude(s) for diagram number 286 + FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram287( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 287 OF 1240 *** + // Wavefunction(s) for diagram number 287 + FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); + // Amplitude(s) for diagram number 287 + VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + 
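Every diagramXXX kernel in this hunk takes the same uniform argument list and immediately pulls in "diagram_boilerplate.h". That header is not part of this hunk, so the following self-contained C++ sketch only illustrates the contract restated in each kernel's comments: with MGONGPU_SUPPORTS_MULTICHANNEL defined the boilerplate provides channelId and the numerator/denominator accumulators, and without it the three pointers must all be nullptr. All names except the macro are hypothetical stand-ins, not the plugin's actual code.

#include <cassert>

using fptype = double;

// Hypothetical stand-in for cxabs2( amp_sv[0] ): |re + i*im|^2
inline fptype demo_cxabs2( fptype re, fptype im ) { return re * re + im * im; }

// Illustrative kernel with the same uniform interface as the diagramXXX kernels above
void demo_diagram( const unsigned int* channelIds, fptype* numerators, fptype* denominators )
{
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  const unsigned int channelId = channelIds[0]; // SCALAR channelId[0] in the C++ case
  const fptype ampRe = 0.1, ampIm = 0.2;        // stand-in for this diagram's amp_sv[0]
  if( channelId == 282 ) numerators[0] += demo_cxabs2( ampRe, ampIm );  // this diagram's channel
  if( channelId != 0 ) denominators[0] += demo_cxabs2( ampRe, ampIm ); // SDE not disabled
#else
  // Uniform interface without multichannel support: the three pointers must be nullptr
  assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
#endif
}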
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram288( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 288 OF 1240 ***
+ // Wavefunction(s) for diagram number 288
+ // (none)
+ // Amplitude(s) for diagram number 288
+ FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram289( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 289 OF 1240 ***
+ // Wavefunction(s) for diagram number 289
+ // (none)
+ // Amplitude(s) for diagram number 289
+ FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram290( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 290 OF 1240 ***
+ // Wavefunction(s) for diagram number 290
+ // (none)
+ // Amplitude(s) for diagram number 290
+ VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram291( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 291 OF 1240 ***
+ // Wavefunction(s) for diagram number 291
+ // (none)
+ // Amplitude(s) for diagram number 291
+ FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram292( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 292 OF 1240 ***
+ // Wavefunction(s) for diagram number 292
+ // (none)
+ // Amplitude(s) for diagram number 292
+ FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram293( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 293 OF 1240 ***
+ // Wavefunction(s) for diagram number 293
+ // (none)
+ // Amplitude(s) for diagram number 293
+ VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram294( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 294 OF 1240 ***
+ // Wavefunction(s) for diagram number 294
+ // (none)
+ // Amplitude(s) for diagram number 294
+ FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram295( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 295 OF 1240 ***
+ // Wavefunction(s) for diagram number 295
+ // (none)
+ // Amplitude(s) for diagram number 295
+ VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram296( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 296 OF 1240 ***
+ // Wavefunction(s) for diagram number 296
+ // (none)
+ // Amplitude(s) for diagram number 296
+ VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram297( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 297 OF 1240 ***
+ // Wavefunction(s) for diagram number 297
+ // (none)
+ // Amplitude(s) for diagram number 297
+ VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
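Diagram 297 above is the one four-gluon-vertex case in this hunk: the VVVV coupling is evaluated as three separate color structures (VVVV1_0, VVVV3_0 and VVVV4_0 with the same wavefunctions and COUPs[2]), and each result is scattered into a different set of color amplitudes with weight +i or -i. Below is a minimal scalar sketch of that scatter pattern, using std::complex in place of the plugin's SIMD-aware cxtype; the indices are copied from the VVVV1_0 block of diagram 297 and the function name is hypothetical.

#include <complex>

using cxdemo = std::complex<double>;

// Scatter one amplitude into selected color amplitudes with +/- i weights,
// mirroring "J_ACCESS::kernelAccessIcol( jamps, icol ) +=/-= cxtype( 0, 1 ) * amp_sv[0]".
void demoScatterVVVV1( cxdemo* jamp, const cxdemo& amp )
{
  const cxdemo I( 0., 1. );
  jamp[45] -= I * amp;
  jamp[59] += I * amp;
  jamp[69] -= I * amp;
  jamp[83] += I * amp;
  jamp[87] -= I * amp;
  jamp[89] += I * amp;
  jamp[93] -= I * amp;
  jamp[105] += I * amp;
}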
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram298( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 298 OF 1240 ***
+ // Wavefunction(s) for diagram number 298
+ FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
+ // Amplitude(s) for diagram number 298
+ FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram299( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 299 OF 1240 ***
+ // Wavefunction(s) for diagram number 299
+ // (none)
+ // Amplitude(s) for diagram number 299
+ FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram300( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 300 OF 1240 ***
+ // Wavefunction(s) for diagram number 300
+ // (none)
+ // Amplitude(s) for diagram number 300
+ FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram301( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 301 OF 1240 ***
+ // Wavefunction(s) for diagram number 301
+ // (none)
+ // Amplitude(s) for diagram number 301
+ FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram302( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 302 OF 1240 ***
+ // Wavefunction(s) for diagram number 302
+ FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+ // Amplitude(s) for diagram number 302
+ FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram303( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 303 OF 1240 ***
+ // Wavefunction(s) for diagram number 303
+ // (none)
+ // Amplitude(s) for diagram number 303
+ VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram304( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 304 OF 1240 ***
+ // Wavefunction(s) for diagram number 304
+ // (none)
+ // Amplitude(s) for diagram number 304
+ FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
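The jamps argument is documented above as a flat jamps[ncolor*2*nevtORneppV] buffer, and J_ACCESS::kernelAccessIcol( jamps, icol ) evidently yields a writable view of color amplitude icol for the current event. Below is a scalar sketch of one plausible SoA layout behind such an accessor; the real plugin accessor is templated for SIMD and GPU access, so the layout and names here are assumptions for illustration only.

#include <complex>

using fptype = double;

// Scalar sketch of an SoA accessor over a jamps[ncolor*2*nevt] buffer:
// real and imaginary parts are stored as separate nevt-long blocks per color.
struct DemoJampAccess
{
  static std::complex<fptype> load( const fptype* jamps, int icol, int ievt, int nevt )
  {
    const fptype re = jamps[( 2 * icol ) * nevt + ievt];
    const fptype im = jamps[( 2 * icol + 1 ) * nevt + ievt];
    return { re, im };
  }
  static void add( fptype* jamps, int icol, int ievt, int nevt, const std::complex<fptype>& a )
  {
    jamps[( 2 * icol ) * nevt + ievt] += a.real();
    jamps[( 2 * icol + 1 ) * nevt + ievt] += a.imag();
  }
};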
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram305( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 305 OF 1240 ***
+ // Wavefunction(s) for diagram number 305
+ // (none)
+ // Amplitude(s) for diagram number 305
+ FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram306( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 306 OF 1240 ***
+ // Wavefunction(s) for diagram number 306
+ // (none)
+ // Amplitude(s) for diagram number 306
+ VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram307( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 307 OF 1240 ***
+ // Wavefunction(s) for diagram number 307
+ // (none)
+ // Amplitude(s) for diagram number 307
+ FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram308( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 308 OF 1240 ***
+ // Wavefunction(s) for diagram number 308
+ // (none)
+ // Amplitude(s) for diagram number 308
+ FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram309( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 309 OF 1240 ***
+ // Wavefunction(s) for diagram number 309
+ // (none)
+ // Amplitude(s) for diagram number 309
+ VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram310( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 310 OF 1240 ***
+ // Wavefunction(s) for diagram number 310
+ // (none)
+ // Amplitude(s) for diagram number 310
+ FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram311( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 311 OF 1240 ***
+ // Wavefunction(s) for diagram number 311
+ FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+ // Amplitude(s) for diagram number 311
+ FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ }
+
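The channelId/numerators/denominators bookkeeping repeated in every kernel above implements single-diagram enhancement (SDE): per event, the selected channel's |amp|^2 goes into the numerator while every diagram's |amp|^2 goes into the denominator, and channelId == 0 disables the machinery. Below is a self-contained sketch of the ratio these accumulators are building, with a hypothetical scalar helper in place of the plugin's vectorized types.

#include <vector>

using fptype = double;

// Sketch of the single-diagram-enhancement (SDE) ratio built from the
// numerators/denominators accumulated above: the selected channel's |amp|^2
// over the sum of |amp|^2 of all contributing diagrams (hypothetical helper).
fptype demo_multichannelWeight( const std::vector<fptype>& amp2PerDiagram, unsigned int channelId )
{
  if( channelId == 0 ) return 1.; // channelId == 0 disables SDE (see the comments above)
  fptype den = 0.;
  for( fptype a2 : amp2PerDiagram ) den += a2;       // denominator: every diagram contributes
  const fptype num = amp2PerDiagram[channelId - 1];  // numerator: the selected diagram only (1-based)
  return den > 0. ? num / den : 0.;
}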
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram312( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 312 OF 1240 ***
+ // Wavefunction(s) for diagram number 312
+ // (none)
+ // Amplitude(s) for diagram number 312
+ FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram313( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 313 OF 1240 ***
+ // Wavefunction(s) for diagram number 313
+ FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
+ // Amplitude(s) for diagram number 313
+ FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram314( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 314 OF 1240 ***
+ // Wavefunction(s) for diagram number 314
+ // (none)
+ // Amplitude(s) for diagram number 314
+ FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram315( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 315 OF 1240 ***
+ // Wavefunction(s) for diagram number 315
+ // (none)
+ // Amplitude(s) for diagram number 315
+ FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram316( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 316 OF 1240 ***
+ // Wavefunction(s) for diagram number 316
+ // (none)
+ // Amplitude(s) for diagram number 316
+ FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram317( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 317 OF 1240 ***
+ // Wavefunction(s) for diagram number 317
+ // (none)
+ // Amplitude(s) for diagram number 317
+ FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram318( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 318 OF 1240 ***
+ // Wavefunction(s) for diagram number 318
+ // (none)
+ // Amplitude(s) for diagram number 318
+ VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram319( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 319 OF 1240 ***
+ // Wavefunction(s) for diagram number 319
+ // (none)
+ // Amplitude(s) for diagram number 319
+ FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram320( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 320 OF 1240 ***
+ // Wavefunction(s) for diagram number 320
+ // (none)
+ // Amplitude(s) for diagram number 320
+ FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram321( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 321 OF 1240 ***
+ // Wavefunction(s) for diagram number 321
+ // (none)
+ // Amplitude(s) for diagram number 321
+ FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram322( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 322 OF 1240 ***
+ // Wavefunction(s) for diagram number 322
+ FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+ // Amplitude(s) for diagram number 322
+ FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram323( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 323 OF 1240 ***
+ // Wavefunction(s) for diagram number 323
+ // (none)
+ // Amplitude(s) for diagram number 323
+ FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram324( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 324 OF 1240 ***
+ // Wavefunction(s) for diagram number 324
+ // (none)
+ // Amplitude(s) for diagram number 324
+ FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram325( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 325 OF 1240 ***
+ // Wavefunction(s) for diagram number 325
+ // (none)
+ // Amplitude(s) for diagram number 325
+ FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
+ if(
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram326( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 326 OF 1240 *** + // Wavefunction(s) for diagram number 326 + // (none) + // Amplitude(s) for diagram number 326 + FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram327( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 327 OF 1240 *** + // Wavefunction(s) for diagram number 327 + // (none) + // Amplitude(s) for diagram number 327 + VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram328( fptype* wfs, // input/output 
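
The two-line comment repeated at the top of every diagramNNN kernel describes what "diagram_boilerplate.h" must provide, but the header itself is not part of this hunk. The following is a minimal sketch, under stated assumptions, of the kind of prologue that comment implies: only the names visible in the kernels above (channelIds, numerators, denominators, MGONGPU_SUPPORTS_MULTICHANNEL) are taken from the diff; everything else, including the scalar lookup, is an illustrative assumption, not the plugin's actual code.

#include <cassert>
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // multichannel enabled: derive the scalar channelId used by the kernel body
  // (assumption: a plain scalar read is shown here for simplicity)
  const unsigned int channelId = channelIds[0];
#else
  // multichannel disabled: the uniform interface still carries the three
  // pointers, so sanity-check that the caller passed nullptr for all of them
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif

The sketch deliberately omits the amp_fp/amp_sv and numerators_sv/denominators_sv definitions that the kernel bodies also rely on, since this hunk gives no basis for reconstructing them.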
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram328( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 328 OF 1240 *** + // Wavefunction(s) for diagram number 328 + // (none) + // Amplitude(s) for diagram number 328 + FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram329( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 329 OF 1240 *** + // Wavefunction(s) for diagram number 329 + // (none) + // Amplitude(s) for diagram number 329 + FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram330( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 330 OF 1240 *** + // Wavefunction(s) for diagram number 330 + // (none) + // Amplitude(s) for diagram number 330 + FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram331( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 331 OF 1240 *** + // Wavefunction(s) for diagram number 331 + FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); + // Amplitude(s) for diagram number 331 + FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + }
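
Every multichannel block above follows one pattern: the squared amplitude of the current diagram is added to denominators_sv whenever any channel is active (channelId != 0), and to numerators_sv only when channelId selects this very diagram. Read schematically over all diagrams d, the two accumulators form the usual single-diagram-enhancement (SDE) weight

  w_\text{SDE} = \frac{ |A_\text{channelId}|^2 }{ \sum_d |A_d|^2 }

where A_d is the amplitude of diagram d. This is a reading of the accumulation pattern, not a formula spelled out in the hunk itself; the division happens downstream, outside this diff.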
"diagram_boilerplate.h" + // *** DIAGRAM 332 OF 1240 *** + // Wavefunction(s) for diagram number 332 + // (none) + // Amplitude(s) for diagram number 332 + FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram333( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 333 OF 1240 *** + // Wavefunction(s) for diagram number 333 + // (none) + // Amplitude(s) for diagram number 333 + FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram334( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 334 OF 1240 *** + // Wavefunction(s) for diagram number 334 + // (none) + // Amplitude(s) for diagram number 334 + FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram335( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 335 OF 1240 *** + // Wavefunction(s) for diagram number 335 + // (none) + // Amplitude(s) for diagram number 335 + FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram336( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 336 OF 1240 *** + // Wavefunction(s) for diagram number 336 + // (none) + // Amplitude(s) for diagram number 336 + VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram337( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 337 OF 1240 *** + // Wavefunction(s) for diagram number 337 + // (none) + // Amplitude(s) for diagram number 337 + FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram338( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 338 OF 1240 *** + // Wavefunction(s) for diagram number 338 + // (none) + // Amplitude(s) for diagram number 338 + FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
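
The J_ACCESS::kernelAccessIcol calls index jamps, declared above as jamps[ncolor*2*nevtORneppV], by colour index icol. One layout consistent with that declaration is colour-major with separate real and imaginary planes per event; the helper below is a self-contained illustration under that assumption only, written with plain double to stay standalone, and is not the plugin's actual J_ACCESS accessor.

// Hypothetical accessor assuming the layout jamps[icol][ireim][ievt]
inline double& jampPart( double* jamps, int icol, int ireim, int ievt, int nevt )
{
  return jamps[( icol * 2 + ireim ) * nevt + ievt]; // ireim: 0 = Re, 1 = Im
}

Whatever the real layout, the contract visible in the kernels is the same: kernelAccessIcol returns something to which the (possibly complex, possibly vectorized) amp_sv[0] can be added or subtracted in place.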
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 339 OF 1240 *** + // Wavefunction(s) for diagram number 339 + // (none) + // Amplitude(s) for diagram number 339 + FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram340( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 340 OF 1240 *** + // Wavefunction(s) for diagram number 340 + // (none) + // Amplitude(s) for diagram number 340 + VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram341( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 341 OF 1240 *** + // Wavefunction(s) for diagram number 341 + // (none) + // Amplitude(s) for diagram number 341 + VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram342( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 342 OF 1240 *** + // Wavefunction(s) for diagram number 342 + // (none) + // Amplitude(s) for diagram number 342 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
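
Diagram 342 is the first in this hunk to use the four-gluon vertex: it is evaluated as three separate calls (VVVV1_0, VVVV3_0, VVVV4_0), one per independent colour/Lorentz structure, each overwriting amp_fp[0] before its contribution is scattered into a different set of colour flows; note also that these three calls carry no multichannel #ifdef block, unlike the single-amplitude diagrams around them. The scatter step for every amplitude in this file has the schematic form

  \text{jamp}_c \mathrel{+}= \kappa_{s,c} \, \text{amp}_s , \qquad \kappa_{s,c} \in \{ 0, \pm 1, \pm i \}

for structure s and colour flow c, which is exactly the set of coefficients (amp_sv[0], cxtype( 0, 1 ) * amp_sv[0], and their negatives) appearing in the kernels.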
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram343( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 343 OF 1240 *** + // Wavefunction(s) for diagram number 343 + // (none) + // Amplitude(s) for diagram number 343 + FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram344( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 344 OF 1240 *** + // Wavefunction(s) for diagram number 344 + // (none) + // Amplitude(s) for diagram number 344 + FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram345( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 345 OF 1240 *** + // Wavefunction(s) for diagram number 345 + // (none) + // Amplitude(s) for diagram number 345 + FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram346( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 346 OF 1240 *** + // Wavefunction(s) for diagram number 346 + // (none) + // Amplitude(s) for diagram number 346 + FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram347( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 347 OF 1240 *** + // Wavefunction(s) for diagram number 347 + // (none) + // Amplitude(s) for diagram number 347 + VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram348( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 348 OF 1240 *** + // Wavefunction(s) for diagram number 348 + // (none) + // Amplitude(s) for diagram number 348 + VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
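
Because every diagramNNN kernel shares the same signature, a driver can launch them back-to-back with an identical argument list. A minimal sketch of the GPU-side call sequence, assuming CUDA launch syntax; gpublocks, gputhreads and the buffer variables are placeholders, and the plugin's actual driver is not part of this hunk:

  diagram347<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram348<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram349<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );

On the C++ side the #else branch of each signature applies instead, so the fourth argument would be the per-event-page COUPs array rather than the all-events couplings buffer.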
J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram350( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 350 OF 1240 *** + // Wavefunction(s) for diagram number 350 + // (none) + // Amplitude(s) for diagram number 350 + FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram351( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 351 OF 1240 *** + // Wavefunction(s) for 
diagram number 351 + // (none) + // Amplitude(s) for diagram number 351 + FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram352( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 352 OF 1240 *** + // Wavefunction(s) for diagram number 352 + // (none) + // Amplitude(s) for diagram number 352 + FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram353( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 353 OF 1240 *** + // Wavefunction(s) for diagram number 353 + // (none) + // Amplitude(s) for diagram number 353 + FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram354( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 354 OF 1240 *** + // Wavefunction(s) for diagram number 354 + // (none) + // Amplitude(s) for diagram number 354 + VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram355( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 355 OF 1240 *** + // Wavefunction(s) for diagram number 355 + // (none) + // Amplitude(s) for diagram number 355 + VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 355 
) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram356( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 356 OF 1240 *** + // Wavefunction(s) for diagram number 356 + // (none) + // Amplitude(s) for diagram number 356 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol(
jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram357( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 357 OF 1240 *** + // Wavefunction(s) for diagram number 357 + // (none) + // Amplitude(s) for diagram number 357 + FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram358( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 358 OF 1240 *** + // Wavefunction(s) for diagram number 358 + // (none) + // Amplitude(s) for diagram number 358 + FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram359( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 359 OF 1240 *** + // Wavefunction(s) for diagram number 359 + // (none) + // Amplitude(s) for diagram number 359 + VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram360( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU
or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 360 OF 1240 *** + // Wavefunction(s) for diagram number 360 + // (none) + // Amplitude(s) for diagram number 360 + FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram361( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 361 OF 1240 *** + // Wavefunction(s) for diagram number 361 + // (none) + // Amplitude(s) for diagram number 361 + FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps,
117 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram362( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 362 OF 1240 *** + // Wavefunction(s) for diagram number 362 + // (none) + // Amplitude(s) for diagram number 362 + VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + +
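Each generated diagramNNN kernel above follows the same pattern: it evaluates one amplitude into amp_sv[0], optionally accumulates |amp|^2 into the single-diagram-enhancement (SDE) numerators and denominators, and then scatters the amplitude, weighted by +/-1 or +/-i, into a few of the ncolor colour-flow sums in jamps. A minimal single-event, scalar C++ sketch of that accumulation logic follows; the plain std::complex type and the names accumulateDiagram/weights are illustrative stand-ins, not the plugin's SIMD/GPU types or its J_ACCESS memory layout.

#include <complex>
#include <cstddef>
#include <utility>
#include <vector>

using cx = std::complex<double>;

// Scalar stand-in for one generated diagram kernel (hypothetical helper, for
// illustration only): 'weights' lists (icol, weight) pairs that mirror the
// +=/-= statements and cxtype( 0, 1 ) factors emitted for each diagram above.
void accumulateDiagram( const cx& amp,            // this diagram's amplitude
                        unsigned int thisDiagram, // its diagram number
                        unsigned int channelId,   // selected SDE channel (0 = SDE disabled)
                        std::vector<cx>& jamp,    // the ncolor colour-flow sums
                        const std::vector<std::pair<std::size_t, cx>>& weights,
                        double& numerator,
                        double& denominator )
{
  if( channelId == thisDiagram ) numerator += std::norm( amp ); // cxabs2 analogue
  if( channelId != 0 ) denominator += std::norm( amp );
  for( const auto& [icol, w] : weights ) jamp[icol] += w * amp; // w is +/-1 or +/-i
}

int main()
{
  const cx I( 0., 1. );
  std::vector<cx> jamp( 120 ); // the jamp indices above reach 119, i.e. ncolor = 120
  double num = 0., den = 0.;
  // e.g. the colour-flow pattern of diagram 355 above, with a dummy amplitude
  accumulateDiagram( cx( 0.1, 0.2 ), 355, 355, jamp,
                     { { 33, -I }, { 35, I }, { 41, I }, { 47, -I }, { 65, I }, { 71, -I }, { 95, -I }, { 119, I } },
                     num, den );
  return 0;
}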
//-------------------------------------------------------------------------- + + __global__ void + diagram363( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 363 OF 1240 *** + // Wavefunction(s) for diagram number 363 + // (none) + // Amplitude(s) for diagram number 363 + FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram364( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 364 OF 1240 *** + // Wavefunction(s) for diagram number 364 + // (none) + // Amplitude(s) for diagram number 364 + FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol(
jamps, 95 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram365( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 365 OF 1240 *** + // Wavefunction(s) for diagram number 365 + // (none) + // Amplitude(s) for diagram number 365 + VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram366( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 366 OF 1240 *** + // Wavefunction(s) for diagram number 366 + // (none) + // Amplitude(s) for diagram number 366 + FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram367( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 367 OF 1240 *** + // Wavefunction(s) for diagram number 367 + // (none) + // Amplitude(s) for diagram number 367 + FFV1_0( w_fp[62], w_fp[33],
w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram368( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 368 OF 1240 *** + // Wavefunction(s) for diagram number 368 + // (none) + // Amplitude(s) for diagram number 368 + FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram369( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, //
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 369 OF 1240 *** + // Wavefunction(s) for diagram number 369 + // (none) + // Amplitude(s) for diagram number 369 + VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram370( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three
pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 370 OF 1240 *** + // Wavefunction(s) for diagram number 370 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 370 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram371( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 371 OF 1240 *** + // Wavefunction(s) for diagram number 371 + // (none) + // Amplitude(s) for diagram number 371 + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram372( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 372 OF 1240 *** + // Wavefunction(s) for diagram number 372 + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] ); + FFV1P0_3( w_fp[3],
w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] ); + // Amplitude(s) for diagram number 372 + VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram373( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 373 OF 1240 *** + // Wavefunction(s) for diagram number 373 + // (none) + // Amplitude(s) for diagram number 373 + FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram374( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the
boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 374 OF 1240 *** + // Wavefunction(s) for diagram number 374 + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 374 + VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram375( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 375 OF 1240 *** + // Wavefunction(s) for diagram number 375 + // (none) + // Amplitude(s) for diagram number 375 + FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram376( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 376 OF 1240 *** + // Wavefunction(s) for diagram number 376 + VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + // Amplitude(s) for diagram number 376 + FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram377( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when
MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 377 OF 1240 *** + // Wavefunction(s) for diagram number 377 + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); + // Amplitude(s) for diagram number 377 + FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram378( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 378 OF 1240 *** + // Wavefunction(s) for diagram number 378 + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 378 + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram379( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 379 OF 1240 *** + // Wavefunction(s) for
diagram number 379 + // (none) + // Amplitude(s) for diagram number 379 + FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram380( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 380 OF 1240 *** + // Wavefunction(s) for diagram number 380 + // (none) + // Amplitude(s) for diagram number 380 + FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram381( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 381 OF 1240 *** + // Wavefunction(s) for diagram number 381 + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); + // Amplitude(s) for diagram number 381 + FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 381 ) numerators_sv +=
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram382( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 382 OF 1240 *** + // Wavefunction(s) for diagram number 382 + // (none) + // Amplitude(s) for diagram number 382 + FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram383( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 383 OF 1240 *** + // Wavefunction(s) for diagram number 383 + // (none) + // Amplitude(s) for diagram number 383 + FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram384( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 384 OF 1240 *** + // Wavefunction(s) for diagram number 384 + // (none) + // Amplitude(s) for diagram number 384 + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram385( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 385 OF 1240 *** + // Wavefunction(s) for diagram number 385 + VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] ); + // Amplitude(s) for diagram number 385 + FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram386( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 386 OF 1240 *** + // Wavefunction(s) for diagram number 386 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); + // Amplitude(s) for diagram number 386 + FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram387( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 387 OF 1240 *** + // Wavefunction(s) for diagram number 387 + // (none) + // Amplitude(s) for diagram number 387 + FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
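Each diagramNNN kernel above funnels its arguments through diagram_boilerplate.h before the diagram body runs. That header is not part of this hunk; the snippet below is only a minimal sketch, under assumptions, of the uniform-interface contract its recurring comment describes (the helper name boilerplate_channelId is hypothetical; only the assert behaviour is stated in the comments above):

    #include <cassert>
    typedef double fptype;

    // Hypothetical sketch of the sanity check performed by diagram_boilerplate.h:
    // the multichannel arguments exist in every build, but must all be nullptr
    // when the multichannel feature is compiled out.
    inline unsigned int boilerplate_channelId( const unsigned int* channelIds,
                                               const fptype* numerators,
                                               const fptype* denominators )
    {
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      return channelIds ? channelIds[0] : 0; // SCALAR channelId[0] in the C++ case
    #else
      assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
      return 0; // channelId==0 disables single-diagram enhancement
    #endif
    }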
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram388( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 388 OF 1240 ***
+ // Wavefunction(s) for diagram number 388
+ FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
+ // Amplitude(s) for diagram number 388
+ VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram389( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 389 OF 1240 ***
+ // Wavefunction(s) for diagram number 389
+ // (none)
+ // Amplitude(s) for diagram number 389
+ FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram390( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 390 OF 1240 ***
+ // Wavefunction(s) for diagram number 390
+ // (none)
+ // Amplitude(s) for diagram number 390
+ VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram391( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 391 OF 1240 ***
+ // Wavefunction(s) for diagram number 391
+ // (none)
+ // Amplitude(s) for diagram number 391
+ FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram392( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 392 OF 1240 ***
+ // Wavefunction(s) for diagram number 392
+ // (none)
+ // Amplitude(s) for diagram number 392
+ FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
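Diagram 392 above (like diagram 408 further down) issues several HELAS calls in one kernel, all writing the same one-slot amplitude buffer; each result is scattered into the colour-flow array before the next call overwrites it. A minimal standalone sketch of that reuse pattern, with stand-in standard-library types and illustrative index/sign tables:

    #include <complex>
    typedef std::complex<double> cxtype;

    // Sketch: accumulate one freshly computed amplitude into selected jamps with
    // coefficients +/-i, as the generated code does after each FFV1_0 call above.
    inline void addToJamps( cxtype* jamps, const int* icol, const int* sign, int n, const cxtype& amp )
    {
      for( int i = 0; i < n; i++ )
        jamps[icol[i]] += cxtype( 0, sign[i] ) * amp; // sign[i] is +1 or -1
    }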
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram393( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 393 OF 1240 ***
+ // Wavefunction(s) for diagram number 393
+ FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+ // Amplitude(s) for diagram number 393
+ FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram394( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 394 OF 1240 ***
+ // Wavefunction(s) for diagram number 394
+ FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );
+ // Amplitude(s) for diagram number 394
+ FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram395( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 395 OF 1240 ***
+ // Wavefunction(s) for diagram number 395
+ // (none)
+ // Amplitude(s) for diagram number 395
+ FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram396( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 396 OF 1240 ***
+ // Wavefunction(s) for diagram number 396
+ // (none)
+ // Amplitude(s) for diagram number 396
+ FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
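The recurring #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block implements single-diagram enhancement: every diagram adds |amp|^2 to the denominator, while only the diagram whose number matches the event's channelId also feeds the numerator. A self-contained sketch of the arithmetic follows; cxabs2 is reproduced with the meaning its call sites above imply, and the weight() helper is an assumption about how the two sums are eventually combined:

    #include <complex>
    typedef double fptype;
    typedef std::complex<double> cxtype;

    inline fptype cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

    // Sketch: per-event multichannel bookkeeping accumulated across all diagrams.
    struct MultiChannelSums
    {
      fptype numerators = 0;   // |amp|^2 of the selected channel's diagram only
      fptype denominators = 0; // |amp|^2 summed over all diagrams
      void add( unsigned int channelId, unsigned int diagramId, const cxtype& amp )
      {
        if( channelId == diagramId ) numerators += cxabs2( amp );
        if( channelId != 0 ) denominators += cxabs2( amp ); // channelId==0 disables SDE
      }
      fptype weight() const { return denominators != 0 ? numerators / denominators : 1; } // hypothetical final use
    };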
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram397( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 397 OF 1240 ***
+ // Wavefunction(s) for diagram number 397
+ FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+ // Amplitude(s) for diagram number 397
+ FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram398( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 398 OF 1240 ***
+ // Wavefunction(s) for diagram number 398
+ // (none)
+ // Amplitude(s) for diagram number 398
+ FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram399( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 399 OF 1240 ***
+ // Wavefunction(s) for diagram number 399
+ // (none)
+ // Amplitude(s) for diagram number 399
+ FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram400( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 400 OF 1240 ***
+ // Wavefunction(s) for diagram number 400
+ // (none)
+ // Amplitude(s) for diagram number 400
+ FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram401( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 401 OF 1240 ***
+ // Wavefunction(s) for diagram number 401
+ // (none)
+ // Amplitude(s) for diagram number 401
+ FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram402( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 402 OF 1240 ***
+ // Wavefunction(s) for diagram number 402
+ // (none)
+ // Amplitude(s) for diagram number 402
+ FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram403( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 403 OF 1240 ***
+ // Wavefunction(s) for diagram number 403
+ // (none)
+ // Amplitude(s) for diagram number 403
+ FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram404( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 404 OF 1240 ***
+ // Wavefunction(s) for diagram number 404
+ // (none)
+ // Amplitude(s) for diagram number 404
+ FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram405( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 405 OF 1240 ***
+ // Wavefunction(s) for diagram number 405
+ // (none)
+ // Amplitude(s) for diagram number 405
+ FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram406( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 406 OF 1240 ***
+ // Wavefunction(s) for diagram number 406
+ // (none)
+ // Amplitude(s) for diagram number 406
+ FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
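Each diagram contributes its amplitude to a handful of the ncolor jamps entries with coefficients +/-1 or +/-i (the cxtype( 0, 1 ) factor). The jamps are only an intermediate: downstream, outside this hunk, they are presumably folded with the constant colour matrix in the usual MadGraph way, sketched here with standard-library types:

    #include <complex>
    #include <vector>
    typedef double fptype;
    typedef std::complex<double> cxtype;

    // Sketch: |M|^2 = sum_ij conj(jamp[i]) * cf[i][j] * jamp[j], the standard
    // MadGraph colour sum over the colour flows accumulated above.
    inline fptype colorSum( const std::vector<cxtype>& jamp, const std::vector<std::vector<fptype>>& cf )
    {
      fptype me2 = 0;
      for( size_t i = 0; i < jamp.size(); i++ )
      {
        cxtype ztemp = 0;
        for( size_t j = 0; j < jamp.size(); j++ ) ztemp += cf[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real();
      }
      return me2;
    }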
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram407( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 407 OF 1240 ***
+ // Wavefunction(s) for diagram number 407
+ // (none)
+ // Amplitude(s) for diagram number 407
+ FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram408( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 408 OF 1240 ***
+ // Wavefunction(s) for diagram number 408
+ // (none)
+ // Amplitude(s) for diagram number 408
+ VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram409( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 409 OF 1240 ***
+ // Wavefunction(s) for diagram number 409
+ VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 409
+ VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram410( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 410 OF 1240 ***
+ // Wavefunction(s) for diagram number 410
+ VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );
+ // Amplitude(s) for diagram number 410
+ VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram411( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 411 OF 1240 ***
+ // Wavefunction(s) for diagram number 411
+ // (none)
+ // Amplitude(s) for diagram number 411
+ VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
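With one __global__ kernel per diagram, a host-side driver is needed to run them in sequence over the shared wavefunction and jamps buffers. No such driver appears in this hunk; the snippet below is a hypothetical sketch of a GPU-side call sequence for three of the kernels above (gpublocks and gputhreads are assumed launch parameters, not names from this patch):

    // Hypothetical host-side sketch (not part of this patch): diagram kernels
    // launched back-to-back, all reading/writing the same device buffers.
    #ifdef MGONGPUCPP_GPUIMPL
    inline void runSomeDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                 const fptype* couplings, fptype* numerators, fptype* denominators,
                                 int gpublocks, int gputhreads )
    {
      diagram409<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
      diagram410<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
      diagram411<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    }
    #endif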
channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 412 OF 1240 *** + // Wavefunction(s) for diagram number 412 + // (none) + // Amplitude(s) for diagram number 412 + FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram413( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 413 OF 1240 *** + // Wavefunction(s) for diagram number 413 + // (none) + // Amplitude(s) for diagram number 413 + FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram414( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
+  __global__ void
+  diagram414( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 414 OF 1240 ***
+    // Wavefunction(s) for diagram number 414
+    // (none)
+    // Amplitude(s) for diagram number 414
+    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram415( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 415 OF 1240 ***
+    // Wavefunction(s) for diagram number 415
+    // (none)
+    // Amplitude(s) for diagram number 415
+    FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram416( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 416 OF 1240 ***
+    // Wavefunction(s) for diagram number 416
+    // (none)
+    // Amplitude(s) for diagram number 416
+    FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram417( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 417 OF 1240 ***
+    // Wavefunction(s) for diagram number 417
+    // (none)
+    // Amplitude(s) for diagram number 417
+    FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram418( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 418 OF 1240 ***
+    // Wavefunction(s) for diagram number 418
+    // (none)
+    // Amplitude(s) for diagram number 418
+    FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram419( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 419 OF 1240 ***
+    // Wavefunction(s) for diagram number 419
+    // (none)
+    // Amplitude(s) for diagram number 419
+    FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram420( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 420 OF 1240 ***
+    // Wavefunction(s) for diagram number 420
+    // (none)
+    // Amplitude(s) for diagram number 420
+    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram421( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 421 OF 1240 ***
+    // Wavefunction(s) for diagram number 421
+    // (none)
+    // Amplitude(s) for diagram number 421
+    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram422( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 422 OF 1240 ***
+    // Wavefunction(s) for diagram number 422
+    // (none)
+    // Amplitude(s) for diagram number 422
+    FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram423( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 423 OF 1240 ***
+    // Wavefunction(s) for diagram number 423
+    // (none)
+    // Amplitude(s) for diagram number 423
+    FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram424( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 424 OF 1240 ***
+    // Wavefunction(s) for diagram number 424
+    // (none)
+    // Amplitude(s) for diagram number 424
+    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
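+  // [Editor's note, hedged] The four-gluon-vertex kernels such as diagram424 above differ from the others in two
+  // ways: they evaluate three colour structures (VVVV1_0, VVVV3_0, VVVV4_0) into the same amp_sv[0] scratch, and
+  // they add no numerators/denominators contributions, presumably because diagrams with a 4-point vertex are not
+  // mapped to a single-diagram-enhancement channel. For the other kernels the multichannel weight is built from
+  // |amp|^2 via cxabs2, which is assumed to behave like this squared-modulus helper:
+  //   inline fptype_sv cxabs2( const cxtype_sv& c ) { return cxreal( c ) * cxreal( c ) + cximag( c ) * cximag( c ); }
+  // so that, after all diagrams have run, numerators/denominators is the fraction of the squared amplitude of the
+  // single diagram selected by channelId within the sum over all contributing diagrams.
+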
+  __global__ void
+  diagram425( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 425 OF 1240 ***
+    // Wavefunction(s) for diagram number 425
+    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 425
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram426( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 426 OF 1240 ***
+    // Wavefunction(s) for diagram number 426
+    // (none)
+    // Amplitude(s) for diagram number 426
+    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram427( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 427 OF 1240 ***
+    // Wavefunction(s) for diagram number 427
+    // (none)
+    // Amplitude(s) for diagram number 427
+    VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram428( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 428 OF 1240 ***
+    // Wavefunction(s) for diagram number 428
+    // (none)
+    // Amplitude(s) for diagram number 428
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram429( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 429 OF 1240 ***
+    // Wavefunction(s) for diagram number 429
+    // (none)
+    // Amplitude(s) for diagram number 429
+    FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram430( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 430 OF 1240 ***
+    // Wavefunction(s) for diagram number 430
+    // (none)
+    // Amplitude(s) for diagram number 430
+    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram431( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 431 OF 1240 ***
+    // Wavefunction(s) for diagram number 431
+    // (none)
+    // Amplitude(s) for diagram number 431
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram432( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 432 OF 1240 ***
+    // Wavefunction(s) for diagram number 432
+    // (none)
+    // Amplitude(s) for diagram number 432
+    FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram433( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 433 OF 1240 ***
+    // Wavefunction(s) for diagram number 433
+    // (none)
+    // Amplitude(s) for diagram number 433
+    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram434( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 434 OF 1240 ***
+    // Wavefunction(s) for diagram number 434
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 434
+    VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
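+  // [Editor's note, hedged] J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to return a cxtype_sv reference
+  // to colour-flow amplitude icol of the current event (or C++ event page), given the documented layout
+  // jamps[ncolor*2*nevtORneppV]. A sketch of the scalar (neppV == 1) case only, under that assumption:
+  //   static inline cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
+  //   {
+  //     return reinterpret_cast<cxtype_sv*>( jamps )[icol]; // icol-th (re,im) pair for this event
+  //   }
+  // For SIMD event pages (neppV > 1) the real accessor presumably interleaves neppV events per component instead.
+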
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 435 OF 1240 *** + // Wavefunction(s) for diagram number 435 + // (none) + // Amplitude(s) for diagram number 435 + VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram436( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 436 OF 1240 *** + // Wavefunction(s) for diagram number 436 + // (none) + // Amplitude(s) for diagram number 436 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram437( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 437 OF 1240 *** + // Wavefunction(s) for diagram number 437 + VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] ); + // Amplitude(s) for diagram number 437 + VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram438( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 438 OF 1240 *** + // Wavefunction(s) for diagram number 438 + // (none) + // Amplitude(s) for diagram number 438 + VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram439( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 439 OF 1240 *** + // Wavefunction(s) for diagram number 439 + // (none) + // Amplitude(s) for diagram number 439 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram440( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 440 OF 1240 *** + // Wavefunction(s) for diagram number 440 + // (none) + // Amplitude(s) for diagram number 440 + VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram441( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
+ __global__ void
+ diagram441( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 441 OF 1240 ***
+ // Wavefunction(s) for diagram number 441
+ // (none)
+ // Amplitude(s) for diagram number 441
+ VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram442( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 442 OF 1240 ***
+ // Wavefunction(s) for diagram number 442
+ // (none)
+ // Amplitude(s) for diagram number 442
+ VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram443( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 443 OF 1240 ***
+ // Wavefunction(s) for diagram number 443
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+ // Amplitude(s) for diagram number 443
+ VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram444( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 444 OF 1240 ***
+ // Wavefunction(s) for diagram number 444
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
+ // Amplitude(s) for diagram number 444
+ VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram445( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 445 OF 1240 ***
+ // Wavefunction(s) for diagram number 445
+ // (none)
+ // Amplitude(s) for diagram number 445
+ VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram446( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 446 OF 1240 ***
+ // Wavefunction(s) for diagram number 446
+ // (none)
+ // Amplitude(s) for diagram number 446
+ VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram447( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 447 OF 1240 ***
+ // Wavefunction(s) for diagram number 447
+ // (none)
+ // Amplitude(s) for diagram number 447
+ VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram448( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 448 OF 1240 ***
+ // Wavefunction(s) for diagram number 448
+ // (none)
+ // Amplitude(s) for diagram number 448
+ VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
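[Editor's note] The #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block repeated in diagram440, 441, 447 and 448 above implements single-diagram enhancement (SDE) for MadEvent multichanneling: every diagram that defines a channel adds its |amplitude|^2 to the shared denominator, while only the currently selected channel also adds it to the numerator, so the multichannel weight is numerator over denominator. A condensed view of the recurring pattern, with N standing for the diagram number:

  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );   // only the channel being enhanced
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // every diagram, unless SDE is disabled (channelId == 0)
  #endif
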
+ __global__ void
+ diagram449( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 449 OF 1240 ***
+ // Wavefunction(s) for diagram number 449
+ // (none)
+ // Amplitude(s) for diagram number 449
+ VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram450( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 450 OF 1240 ***
+ // Wavefunction(s) for diagram number 450
+ // (none)
+ // Amplitude(s) for diagram number 450
+ VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram451( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 451 OF 1240 ***
+ // Wavefunction(s) for diagram number 451
+ // (none)
+ // Amplitude(s) for diagram number 451
+ FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram452( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 452 OF 1240 ***
+ // Wavefunction(s) for diagram number 452
+ // (none)
+ // Amplitude(s) for diagram number 452
+ FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram453( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 453 OF 1240 ***
+ // Wavefunction(s) for diagram number 453
+ // (none)
+ // Amplitude(s) for diagram number 453
+ FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram454( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 454 OF 1240 ***
+ // Wavefunction(s) for diagram number 454
+ // (none)
+ // Amplitude(s) for diagram number 454
+ FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram455( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 455 OF 1240 ***
+ // Wavefunction(s) for diagram number 455
+ // (none)
+ // Amplitude(s) for diagram number 455
+ VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
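[Editor's note] Each amplitude is scattered into the color-ordered amplitudes jamps with relative signs, or with factors of the imaginary unit cxtype( 0, 1 ) where the color decomposition requires them, and J_ACCESS::kernelAccessIcol selects color flow icol for the current event (or SIMD event page). After the last diagram kernel has run, the squared matrix element is obtained by contracting jamps with the constant color matrix. The following sketch of that final contraction assumes the usual cudacpp names cf, denom, jamp_sv, ncolor and deltaMEs, which do not appear in this diff:

  // Sketch only: color sum over the accumulated color-ordered amplitudes
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype_sv ztemp_sv = cxzero_sv();
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp_sv += cf[icol][jcol] * jamp_sv[jcol]; // color matrix times jamp vector
    deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol];
  }
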
+ __global__ void
+ diagram456( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 456 OF 1240 ***
+ // Wavefunction(s) for diagram number 456
+ // (none)
+ // Amplitude(s) for diagram number 456
+ FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram457( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 457 OF 1240 ***
+ // Wavefunction(s) for diagram number 457
+ // (none)
+ // Amplitude(s) for diagram number 457
+ FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram458( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 458 OF 1240 ***
+ // Wavefunction(s) for diagram number 458
+ // (none)
+ // Amplitude(s) for diagram number 458
+ FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram459( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 459 OF 1240 ***
+ // Wavefunction(s) for diagram number 459
+ // (none)
+ // Amplitude(s) for diagram number 459
+ FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram460( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 460 OF 1240 ***
+ // Wavefunction(s) for diagram number 460
+ // (none)
+ // Amplitude(s) for diagram number 460
+ VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram461( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 461 OF 1240 ***
+ // Wavefunction(s) for diagram number 461
+ // (none)
+ // Amplitude(s) for diagram number 461
+ FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram462( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 462 OF 1240 *** + // Wavefunction(s) for diagram number 462 + // (none) + // Amplitude(s) for diagram number 462 + FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram463( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 463 OF 1240 *** + // Wavefunction(s) for diagram number 463 + // (none) + // Amplitude(s) for diagram number 463 + FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 464 OF 1240 *** + // Wavefunction(s) for diagram number 464 + // (none) + // 
+  __global__ void
+  diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 464 OF 1240 ***
+    // Wavefunction(s) for diagram number 464
+    // (none)
+    // Amplitude(s) for diagram number 464
+    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram465( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 465 OF 1240 ***
+    // Wavefunction(s) for diagram number 465
+    // (none)
+    // Amplitude(s) for diagram number 465
+    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram466( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 466 OF 1240 ***
+    // Wavefunction(s) for diagram number 466
+    // (none)
+    // Amplitude(s) for diagram number 466
+    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram467( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 467 OF 1240 ***
+    // Wavefunction(s) for diagram number 467
+    // (none)
+    // Amplitude(s) for diagram number 467
+    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram468( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 468 OF 1240 ***
+    // Wavefunction(s) for diagram number 468
+    // (none)
+    // Amplitude(s) for diagram number 468
+    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram469( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 469 OF 1240 ***
+    // Wavefunction(s) for diagram number 469
+    // (none)
+    // Amplitude(s) for diagram number 469
+    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
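[Editor's note: the two MGONGPU_SUPPORTS_MULTICHANNEL lines repeated in each kernel implement the single-diagram-enhancement bookkeeping: every single-amplitude diagram adds |amp|^2 to the running denominator, while only the diagram matching the sampled channel adds to the numerator, and the ratio is the multichannel weight applied downstream. A minimal sketch of that arithmetic, with cxabs2 written out explicitly (the cxreal/cximag helper names are assumptions for illustration, not taken from this diff):

    // |z|^2 of a complex amplitude, as accumulated by the kernels above
    inline fptype cxabs2( const cxtype& z ) { return cxreal( z ) * cxreal( z ) + cximag( z ) * cximag( z ); }
    // After all diagram kernels have run, the weight for the sampled channel is
    // presumably the ratio of the two sums, e.g. for a scalar build:
    //   const fptype sdeWeight = numerators_sv / denominators_sv;
]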
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram470( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 470 OF 1240 ***
+    // Wavefunction(s) for diagram number 470
+    // (none)
+    // Amplitude(s) for diagram number 470
+    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram471( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 471 OF 1240 ***
+    // Wavefunction(s) for diagram number 471
+    // (none)
+    // Amplitude(s) for diagram number 471
+    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram472( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 472 OF 1240 ***
+    // Wavefunction(s) for diagram number 472
+    // (none)
+    // Amplitude(s) for diagram number 472
+    FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram473( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 473 OF 1240 ***
+    // Wavefunction(s) for diagram number 473
+    // (none)
+    // Amplitude(s) for diagram number 473
+    FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram474( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 474 OF 1240 ***
+    // Wavefunction(s) for diagram number 474
+    // (none)
+    // Amplitude(s) for diagram number 474
+    FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram475( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 475 OF 1240 ***
+    // Wavefunction(s) for diagram number 475
+    // (none)
+    // Amplitude(s) for diagram number 475
+    VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
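[Editor's note: each amplitude feeds a handful of color-flow amplitudes ("jamps") with weight +1, -1, +i or -i (the cxtype( 0, 1 ) factor). Once all 1240 diagram kernels have run, |M|^2 is presumably obtained by contracting the jamps with the color matrix; schematically, for one event and helicity (cf, denom, ncolor and cxconj are illustrative assumptions here, not identifiers confirmed by this diff):

    // Schematic color sum over the ncolor color flows
    fptype deltaME = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype ztemp = cxtype( 0, 0 );
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztemp += cf[icol][jcol] * jamp[jcol]; // color-matrix row dotted into the jamp vector
      deltaME += cxreal( ztemp * cxconj( jamp[icol] ) ) / denom[icol];
    }
]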
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram476( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 476 OF 1240 ***
+    // Wavefunction(s) for diagram number 476
+    // (none)
+    // Amplitude(s) for diagram number 476
+    FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram477( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 477 OF 1240 ***
+    // Wavefunction(s) for diagram number 477
+    // (none)
+    // Amplitude(s) for diagram number 477
+    VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram478( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 478 OF 1240 ***
+    // Wavefunction(s) for diagram number 478
+    // (none)
+    // Amplitude(s) for diagram number 478
+    FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram479( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 479 OF 1240 ***
+    // Wavefunction(s) for diagram number 479
+    // (none)
+    // Amplitude(s) for diagram number 479
+    FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram480( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 480 OF 1240 ***
+    // Wavefunction(s) for diagram number 480
+    // (none)
+    // Amplitude(s) for diagram number 480
+    FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram481( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 481 OF 1240 ***
+    // Wavefunction(s) for diagram number 481
+    // (none)
+    // Amplitude(s) for diagram number 481
+    FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram482( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 482 OF 1240 ***
+    // Wavefunction(s) for diagram number 482
+    // (none)
+    // Amplitude(s) for diagram number 482
+    VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
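[Editor's note: diagrams with several amplitudes, such as 466, 476 and the following 483, reuse amp_sv[0] for each successive FFV1_0 call and carry no numerator/denominator block, so they contribute to the jamps but never to the channel weight. The per-diagram kernels are only meaningful when run in sequence over the same wfs/jamps buffers; a hypothetical host-side driver (gpuLaunchKernel, gpublocks/gputhreads and the buffer names are assumptions, since the actual scheduling code is outside this hunk) might look like:

    // Hypothetical sequential launch of the generated per-diagram kernels
    using DiagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
    static const DiagramKernel kernels[] = { diagram462, diagram463, diagram464 /* ... up to diagram1240 */ };
    for( DiagramKernel k : kernels )
      gpuLaunchKernel( k, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
]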
+  __global__ void
+  diagram483( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 483 OF 1240 ***
+    // Wavefunction(s) for diagram number 483
+    // (none)
+    // Amplitude(s) for diagram number 483
+    FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram484( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 484 OF 1240 ***
+    // Wavefunction(s) for diagram number 484
+    // (none)
+    // Amplitude(s) for diagram number 484
+    FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram485( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 485 OF 1240 ***
+    // Wavefunction(s) for diagram number 485
+    // (none)
+    // Amplitude(s) for diagram number 485
+    FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram486( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 486 OF 1240 ***
+    // Wavefunction(s) for diagram number 486
+    // (none)
+    // Amplitude(s) for diagram number 486
+    FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram487( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 487 OF 1240 ***
+    // Wavefunction(s) for diagram number 487
+    // (none)
+    // Amplitude(s) for diagram number 487
+    FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
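[Editor's note: the two signature branches reflect the data layouts of the two builds: on GPU each kernel receives one global array of event-dependent couplings and must locate its own event, while the C++/SIMD build receives ready-made per-event-page pointers. A sketch of the GPU-side decoding this implies (the CD_ACCESS accessor, ndcoup and nxcoup names are assumptions borrowed for illustration, not code shown in this diff):

    #ifdef MGONGPUCPP_GPUIMPL
    // Build the per-event COUPs[] array that the helas calls index as COUPs[0], COUPs[1], ...
    const fptype* COUPs[nxcoup];
    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
      COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); // dependent couplings, this event
    #endif
]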
+  __global__ void
+  diagram488( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 488 OF 1240 ***
+    // Wavefunction(s) for diagram number 488
+    // (none)
+    // Amplitude(s) for diagram number 488
+    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram489( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 489 OF 1240 ***
+    // Wavefunction(s) for diagram number 489
+    // (none)
+    // Amplitude(s) for diagram number 489
+    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram490( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 490 OF 1240 ***
+    // Wavefunction(s) for diagram number 490
+    // (none)
+    // Amplitude(s) for diagram number 490
+    FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram491( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 491 OF 1240 ***
+    // Wavefunction(s) for diagram number 491
+    // (none)
+    // Amplitude(s) for diagram number 491
+    FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram492( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 492 OF 1240 ***
+    // Wavefunction(s) for diagram number 492
+    // (none)
+    // Amplitude(s) for diagram number 492
+    VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram493( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 493 OF 1240 ***
+    // Wavefunction(s) for diagram number 493
+    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 493
+    FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram494( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 494 OF 1240 ***
+    // Wavefunction(s) for diagram number 494
+    // (none)
+    // Amplitude(s) for diagram number 494
+    FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 495 OF 1240 *** + // Wavefunction(s) for diagram number 495 + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 495 + VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram496( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 496 OF 1240 *** + // Wavefunction(s) for diagram number 496 + // (none) + // Amplitude(s) for diagram number 496 + FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram497( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 497 OF 1240 *** + // Wavefunction(s) for diagram number 497 + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 497 + VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram498( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 498 OF 1240 *** + // Wavefunction(s) for diagram number 498 + // (none) + // Amplitude(s) for diagram number 498 + FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram499( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 499 OF 1240 *** + // Wavefunction(s) for diagram number 499 + VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); + VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] ); + // Amplitude(s) for diagram number 499 + FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram500( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 500 OF 1240 *** + // Wavefunction(s) for diagram number 500 + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); + // Amplitude(s) for diagram number 500 + FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram501( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 501 OF 1240 *** + // Wavefunction(s) for diagram number 501 + FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); + // Amplitude(s) for diagram number 501 + FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram502( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 502 OF 1240 *** + // Wavefunction(s) for diagram number 502 + // (none) + // Amplitude(s) for diagram number 502 + FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram503( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 503 OF 1240 *** + // Wavefunction(s) for diagram number 503 + // (none) + // Amplitude(s) for diagram number 503 + FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram504( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 504 OF 1240 *** + // Wavefunction(s) for diagram number 504 + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] ); + // Amplitude(s) for diagram number 504 + FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram505( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 505 OF 1240 *** + // Wavefunction(s) for diagram number 505 + // (none) + // Amplitude(s) for diagram number 505 + FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram506( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 506 OF 1240 *** + // Wavefunction(s) for diagram number 506 + // (none) + // Amplitude(s) for diagram number 506 + FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram507( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 507 OF 1240 *** + // Wavefunction(s) for diagram number 507 + // (none) + // Amplitude(s) for diagram number 507 + FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram508( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 508 OF 1240 *** + // Wavefunction(s) for diagram number 508 + VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] ); + // Amplitude(s) for diagram number 508 + FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram509( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 509 OF 1240 *** + // Wavefunction(s) for diagram number 509 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] ); + // Amplitude(s) for diagram number 509 + FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram510( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 510 OF 1240 *** + // Wavefunction(s) for diagram number 510 + // (none) + // Amplitude(s) for diagram number 510 + FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram511( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 511 OF 1240 *** + // Wavefunction(s) for diagram number 511 + // (none) + // Amplitude(s) for diagram number 511 + VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram512( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], 
add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 512 OF 1240 *** + // Wavefunction(s) for diagram number 512 + // (none) + // Amplitude(s) for diagram number 512 + FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram513( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 513 OF 1240 *** + // Wavefunction(s) for diagram number 513 + // (none) + // Amplitude(s) for diagram number 513 + VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram514( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 514 OF 1240 *** + // Wavefunction(s) for diagram number 514 + // (none) + // Amplitude(s) for diagram number 514 + FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram515( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 515 OF 1240 *** + // Wavefunction(s) for diagram number 515 + // (none) + // Amplitude(s) for diagram number 515 + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) 
+= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram516( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 516 OF 1240 *** + // Wavefunction(s) for diagram number 516 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + // Amplitude(s) for diagram number 516 + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram517( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 517 OF 1240 *** + // Wavefunction(s) for diagram number 517 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, 
cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 517 + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram518( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 518 OF 1240 *** + // Wavefunction(s) for diagram number 518 + // (none) + // Amplitude(s) for diagram number 518 + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram519( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 519 OF 1240 *** + // Wavefunction(s) for diagram number 519 + // (none) + // Amplitude(s) for diagram number 519 + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram520( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 520 OF 1240 *** + // Wavefunction(s) for diagram number 520 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 520 + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram521( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 521 OF 1240 *** + // Wavefunction(s) for diagram number 521 + // (none) + // Amplitude(s) for diagram number 521 + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + 
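Every diagramXXX body above starts by including diagram_boilerplate.h, which is not part of this hunk. The sketch below is a hypothetical reconstruction of what such boilerplate could expand to, assuming the usual cudacpp conventions (NUM_ACCESS/DEN_ACCESS kernel accessors and an amp_sv/amp_fp amplitude buffer); the accessor names and exact layout are assumptions, not the contents of the actual header:

    // Hypothetical sketch of diagram_boilerplate.h (illustration only; assumes <cassert> is available)
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
    const unsigned int channelId = channelIds[blockDim.x * blockIdx.x + threadIdx.x]; // one channelId per GPU event
#else
    const unsigned int channelId = channelIds[0];                                     // one SCALAR channelId per C++ event page
#endif
    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );                // numerator accumulator for this event (page)
    fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );            // denominator accumulator for this event (page)
#else
    assert( channelIds == nullptr );                                                  // sanity check: multichannel disabled
    assert( numerators == nullptr );                                                  // sanity check: multichannel disabled
    assert( denominators == nullptr );                                                // sanity check: multichannel disabled
#endif
    cxtype_sv amp_sv[1];                                                              // buffer for one amplitude at a time
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv );                             // fptype view passed to helas calls as &amp_fp[0]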
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram522( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 522 OF 1240 ***
+    // Wavefunction(s) for diagram number 522
+    // (none)
+    // Amplitude(s) for diagram number 522
+    FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram523( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 523 OF 1240 ***
+    // Wavefunction(s) for diagram number 523
+    // (none)
+    // Amplitude(s) for diagram number 523
+    FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram524( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 524 OF 1240 ***
+    // Wavefunction(s) for diagram number 524
+    // (none)
+    // Amplitude(s) for diagram number 524
+    FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram525( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 525 OF 1240 ***
+    // Wavefunction(s) for diagram number 525
+    // (none)
+    // Amplitude(s) for diagram number 525
+    FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
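Because every diagramXXX kernel shares this exact signature, a driver can treat them uniformly. A hedged sketch of one way such a dispatch could look on the C++ backend, where __global__ expands to nothing (the typedef, table and loop below are invented for illustration and are not taken from this patch):

    // Hypothetical uniform dispatch over the per-diagram kernels (C++ backend sketch, illustration only)
    typedef void ( *DiagramKernel )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
    static const DiagramKernel diagramKernels[] = { diagram492, diagram493, diagram494 /* ..., diagram531, ... */ };
    for( const DiagramKernel kernel : diagramKernels )                    // identical argument list for every diagram
      kernel( wfs, jamps, channelIds, COUPs, numerators, denominators );  // one call per Feynman diagram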
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram526( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 526 OF 1240 ***
+    // Wavefunction(s) for diagram number 526
+    // (none)
+    // Amplitude(s) for diagram number 526
+    FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram527( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 527 OF 1240 ***
+    // Wavefunction(s) for diagram number 527
+    // (none)
+    // Amplitude(s) for diagram number 527
+    FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram528( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 528 OF 1240 ***
+    // Wavefunction(s) for diagram number 528
+    // (none)
+    // Amplitude(s) for diagram number 528
+    FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram529( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 529 OF 1240 ***
+    // Wavefunction(s) for diagram number 529
+    // (none)
+    // Amplitude(s) for diagram number 529
+    FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram530( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 530 OF 1240 ***
+    // Wavefunction(s) for diagram number 530
+    // (none)
+    // Amplitude(s) for diagram number 530
+    FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram531( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface
for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 531 OF 1240 *** + // Wavefunction(s) for diagram number 531 + // (none) + // Amplitude(s) for diagram number 531 + VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram532( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 532 OF 1240 *** + // Wavefunction(s) for diagram number 532 + VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 532 + VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram533( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 533 OF 1240 *** + // Wavefunction(s) for diagram number 533 + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); + // Amplitude(s) for diagram number 533 + VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram534( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 534 OF 1240 *** + // Wavefunction(s) for diagram number 534 + // (none) + // Amplitude(s) for diagram number 534 + VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram535( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 535 OF 1240 *** + // Wavefunction(s) for diagram number 535 + // (none) + // Amplitude(s) for diagram number 535 + FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram536( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 536 OF 1240 *** + // Wavefunction(s) for diagram number 536 + // (none) + // Amplitude(s) for diagram number 536 + FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram537( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 537 OF 1240 *** + // Wavefunction(s) for diagram number 537 + // (none) + // Amplitude(s) for diagram number 537 + FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram538( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 538 OF 1240 *** + // Wavefunction(s) for diagram number 538 + // (none) + // Amplitude(s) for diagram number 538 + FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram539( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 539 OF 1240 *** + // Wavefunction(s) for diagram number 539 + // (none) + // Amplitude(s) for diagram number 539 + FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram540( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 540 OF 1240 *** + // Wavefunction(s) for diagram number 540 + // (none) + // Amplitude(s) for diagram number 540 + FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) 
+= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram541( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 541 OF 1240 *** + // Wavefunction(s) for diagram number 541 + // (none) + // Amplitude(s) for diagram number 541 + FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram542( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 542 OF 1240 *** + // Wavefunction(s) for diagram number 542 + // (none) + // Amplitude(s) for diagram number 542 + FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram543( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 543 OF 1240 *** + // Wavefunction(s) for diagram number 543 + // (none) + // Amplitude(s) for diagram number 543 + FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram544( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 544 OF 1240 *** + // Wavefunction(s) for diagram number 544 + // (none) + // Amplitude(s) for diagram number 544 + FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram545( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 545 OF 1240 *** + // Wavefunction(s) for diagram number 545 + // (none) + // Amplitude(s) for diagram number 545 + FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram546( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 546 OF 1240 *** + // Wavefunction(s) for diagram number 546 + // (none) + // 
Amplitude(s) for diagram number 546 + FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram547( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 547 OF 1240 *** + // Wavefunction(s) for diagram number 547 + // (none) + // Amplitude(s) for diagram number 547 + VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram548( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 548 OF 1240 *** + // Wavefunction(s) for diagram number 548 + VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 548 + VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram549( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 549 OF 1240 *** + // Wavefunction(s) for diagram number 549 + // (none) + // Amplitude(s) for diagram number 549 + VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram550( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 550 OF 1240 *** + // Wavefunction(s) for diagram number 550 + // (none) + // Amplitude(s) for diagram number 550 + VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram551( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 551 OF 1240 *** + // Wavefunction(s) for diagram number 551 + // (none) + // Amplitude(s) for diagram number 551 + FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + +
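Every kernelAccessIcol( jamps, icol ) update above adds one diagram's amplitude, with a coefficient of plus or minus 1 or i (the cxtype( 0, 1 ) factors), into color-flow amplitude icol of the current event. A standalone sketch of how such an accessor could address one event, assuming a simple jamps[icol][reim][ievt] layout consistent with the jamps[ncolor*2*nevtORneppV] size in the signatures (readJampIcol and this exact layout are assumptions, not the actual J_ACCESS implementation, which addresses SIMD event pages):

#include <complex>

using fptype = double;

// Illustrative only: read one event's color amplitude icol from a buffer laid out as
// jamps[icol][reim][ievt], matching the jamps[ncolor*2*nevtORneppV] size noted above.
std::complex<fptype> readJampIcol( const fptype* jamps, int icol, int ievt, int nevt )
{
  const fptype re = jamps[( icol * 2 + 0 ) * nevt + ievt]; // real plane of color index icol
  const fptype im = jamps[( icol * 2 + 1 ) * nevt + ievt]; // imaginary plane of color index icol
  return std::complex<fptype>( re, im );
}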
//-------------------------------------------------------------------------- + + __global__ void + diagram552( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 552 OF 1240 *** + // Wavefunction(s) for diagram number 552 + // (none) + // Amplitude(s) for diagram number 552 + FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram553( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 553 OF 1240 *** + // Wavefunction(s) for diagram number 553 + // (none) + // Amplitude(s) for diagram number 553 + FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram554( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 554 OF 1240 *** + // Wavefunction(s) for diagram number 554 + // (none) + // Amplitude(s) for diagram number 554 + FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram555( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 555 OF 1240 *** + // Wavefunction(s) for diagram number 555 + // (none) + // Amplitude(s) for diagram number 555 + FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + } + +
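The jamps accumulated by these kernels are not yet a matrix element: once all diagrams have run, the color-flow amplitudes must be contracted with the constant color matrix, schematically |M|^2 = sum_ij jamp_i cf_ij conj( jamp_j ). A standalone sketch of that final color sum for one event and helicity (colorSum and cf are illustrative names; the plugin's actual color sum lives outside the per-diagram kernels):

#include <complex>
#include <cstddef>
#include <vector>

using cxtype = std::complex<double>;

// Illustrative only: contract the accumulated color amplitudes with a (real, symmetric)
// color matrix cf to obtain the color-summed |M|^2 for one event and one helicity.
double colorSum( const std::vector<cxtype>& jamp, const std::vector<std::vector<double>>& cf )
{
  double me2 = 0.;
  for( std::size_t i = 0; i < jamp.size(); i++ )
    for( std::size_t j = 0; j < jamp.size(); j++ )
      me2 += ( jamp[i] * cf[i][j] * std::conj( jamp[j] ) ).real();
  return me2;
}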
//-------------------------------------------------------------------------- + + __global__ void + diagram556( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 556 OF 1240 *** + // Wavefunction(s) for diagram number 556 + // (none) + // Amplitude(s) for diagram number 556 + FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram557( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 557 OF 1240 *** + // Wavefunction(s) for diagram number 557 + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 557 + VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram558( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 558 OF 1240 *** + // Wavefunction(s) for diagram number 558 + // (none) + // Amplitude(s) for diagram number 558 + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram559( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 559 OF 1240 *** + // Wavefunction(s) for diagram number 559 + // (none) + // Amplitude(s) for diagram number 559 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
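(Note on the repeated multichannel blocks: for each helicity, cxabs2( amp_sv[0] ) of the diagram matching the selected channel is added to numerators_sv, while the same quantity is added to denominators_sv for every single-channel diagram, with channelId == 0 disabling SDE altogether; diagrams whose amplitudes come only from the four-vector vertices VVVV1/VVVV3/VVVV4, such as diagram 559 above, update neither. A hedged sketch of the presumed downstream use, with fSDE a hypothetical name for the single-diagram-enhancement factor:
  // Sketch only (assumed semantics, not code from this patch):
  //   fSDE = numerators_sv / denominators_sv
  //        = |amp(selected channel)|^2 / sum over single-channel diagrams of |amp|^2
)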
__global__ void + diagram560( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 560 OF 1240 *** + // Wavefunction(s) for diagram number 560 + // (none) + // Amplitude(s) for diagram number 560 + VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram561( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 561 OF 1240 *** + // Wavefunction(s) for diagram number 561 + // (none) + // Amplitude(s) for diagram number 561 + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram562( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 562 OF 1240 *** + // Wavefunction(s) for diagram number 562 + // (none) + // Amplitude(s) for diagram number 562 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram563( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 563 OF 1240 *** + // Wavefunction(s) for diagram number 563 + // (none) + // Amplitude(s) for diagram number 563 + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram564( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 564 OF 1240 *** + // Wavefunction(s) for diagram number 564 + // (none) + // Amplitude(s) for diagram number 564 + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram565( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 565 OF 1240 *** + // Wavefunction(s) for diagram number 565 + // (none) + // Amplitude(s) for diagram number 565 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram566( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 566 OF 1240 *** + // Wavefunction(s) for diagram number 566 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 566 + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram567( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 567 OF 1240 *** + // Wavefunction(s) for diagram number 567 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + // Amplitude(s) for diagram number 567 + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram568( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 568 OF 1240 *** + // Wavefunction(s) for diagram number 568 + // (none) + // Amplitude(s) for diagram number 568 + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram569( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 569 OF 1240 *** + // Wavefunction(s) for diagram number 569 + // (none) + // Amplitude(s) for diagram number 569 + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram570( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code 
asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 570 OF 1240 *** + // Wavefunction(s) for diagram number 570 + // (none) + // Amplitude(s) for diagram number 570 + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram571( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 571 OF 1240 *** + // Wavefunction(s) for diagram number 571 + // (none) + // Amplitude(s) for diagram number 571 + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram572( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 572 OF 1240 *** + // Wavefunction(s) for diagram number 572 + // (none) + // Amplitude(s) for diagram number 572 + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram573( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a 
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 573 OF 1240 *** + // Wavefunction(s) for diagram number 573 + // (none) + // Amplitude(s) for diagram number 573 + VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram574( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 574 OF 1240 *** + // Wavefunction(s) for diagram number 574 + // (none) + // Amplitude(s) for diagram number 574 + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram575( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and
denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 575 OF 1240 *** + // Wavefunction(s) for diagram number 575 + // (none) + // Amplitude(s) for diagram number 575 + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram576( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 576 OF 1240 *** + // Wavefunction(s) for diagram number 576 + // (none) + // Amplitude(s) for diagram number 576 + FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 577 OF 1240 *** + // Wavefunction(s) for diagram number 577 + // (none) + // Amplitude(s) for diagram number 577 + FFV1_0( w_fp[3], 
+  __global__ void
+  diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 577 OF 1240 ***
+    // Wavefunction(s) for diagram number 577
+    // (none)
+    // Amplitude(s) for diagram number 577
+    FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram578( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 578 OF 1240 ***
+    // Wavefunction(s) for diagram number 578
+    // (none)
+    // Amplitude(s) for diagram number 578
+    VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram579( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 579 OF 1240 ***
+    // Wavefunction(s) for diagram number 579
+    // (none)
+    // Amplitude(s) for diagram number 579
+    FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram580( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 580 OF 1240 ***
+    // Wavefunction(s) for diagram number 580
+    // (none)
+    // Amplitude(s) for diagram number 580
+    FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram581( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 581 OF 1240 ***
+    // Wavefunction(s) for diagram number 581
+    // (none)
+    // Amplitude(s) for diagram number 581
+    FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram582( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 582 OF 1240 ***
+    // Wavefunction(s) for diagram number 582
+    // (none)
+    // Amplitude(s) for diagram number 582
+    FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram583( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 583 OF 1240 ***
+    // Wavefunction(s) for diagram number 583
+    // (none)
+    // Amplitude(s) for diagram number 583
+    VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
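Each kernel folds its amplitude into a handful of the colour-flow amplitudes ("jamps") with coefficients that are always +1, -1, +i or -i; the colour matrix is applied once at the end, outside these per-diagram kernels. J_ACCESS::kernelAccessIcol is expected to return a reference to colour amplitude icol for the current event inside the jamps[ncolor*2*nevtORneppV] buffer. The following is a purely hypothetical scalar sketch of such an accessor (the real one must also handle the SIMD event-page layout of the C++ build, and the actual memory layout is an assumption here):

// Hypothetical scalar sketch of the jamps accessor pattern used above.
// Assumed layout: Re/Im of colour amplitude icol stored adjacently per event,
// i.e. jamps[( icol * nevt + ievt ) * 2] = Re and [... + 1] = Im.
struct J_ACCESS_sketch
{
  static __device__ cxtype& kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
    const int nevt = gridDim.x * blockDim.x;                // total events in this grid
    return *reinterpret_cast<cxtype*>( &jamps[( icol * nevt + ievt ) * 2] );
  }
};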
+  __global__ void
+  diagram584( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 584 OF 1240 ***
+    // Wavefunction(s) for diagram number 584
+    // (none)
+    // Amplitude(s) for diagram number 584
+    FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram585( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 585 OF 1240 ***
+    // Wavefunction(s) for diagram number 585
+    // (none)
+    // Amplitude(s) for diagram number 585
+    FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram586( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 586 OF 1240 ***
+    // Wavefunction(s) for diagram number 586
+    // (none)
+    // Amplitude(s) for diagram number 586
+    FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram587( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 587 OF 1240 ***
+    // Wavefunction(s) for diagram number 587
+    // (none)
+    // Amplitude(s) for diagram number 587
+    FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram588( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 588 OF 1240 ***
+    // Wavefunction(s) for diagram number 588
+    // (none)
+    // Amplitude(s) for diagram number 588
+    VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram589( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 589 OF 1240 ***
+    // Wavefunction(s) for diagram number 589
+    // (none)
+    // Amplitude(s) for diagram number 589
+    FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
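The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects the two build modes: on GPU each kernel receives the full dependent-couplings buffer and locates its own event, while in the vectorised C++ build the caller passes an array of coupling pointers already resolved for one event page. A hedged sketch of what call sites might look like follows; the launch configuration, loop bounds and per-page pointer arithmetic are assumptions, not the repository's actual driver code.

// Hypothetical call sites for one diagram kernel in the two build modes.
#ifdef MGONGPUCPP_GPUIMPL
  // GPU: one thread per event; the boilerplate derives per-event couplings from 'couplings'
  diagram589<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
#else
  // C++: loop over SIMD event pages; COUPs[] points at this page's couplings
  for( int ipagV = 0; ipagV < npagV; ++ipagV )
    diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators ); // per-page offsets omitted
#endif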
+  __global__ void
+  diagram590( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 590 OF 1240 ***
+    // Wavefunction(s) for diagram number 590
+    // (none)
+    // Amplitude(s) for diagram number 590
+    FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram591( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 591 OF 1240 ***
+    // Wavefunction(s) for diagram number 591
+    // (none)
+    // Amplitude(s) for diagram number 591
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram592( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 592 OF 1240 ***
+    // Wavefunction(s) for diagram number 592
+    // (none)
+    // Amplitude(s) for diagram number 592
+    FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram593( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 593 OF 1240 ***
+    // Wavefunction(s) for diagram number 593
+    // (none)
+    // Amplitude(s) for diagram number 593
+    VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram594( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 594 OF 1240 ***
+    // Wavefunction(s) for diagram number 594
+    // (none)
+    // Amplitude(s) for diagram number 594
+    FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram595( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 595 OF 1240 ***
+    // Wavefunction(s) for diagram number 595
+    // (none)
+    // Amplitude(s) for diagram number 595
+    FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram596( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 596 OF 1240 ***
+    // Wavefunction(s) for diagram number 596
+    // (none)
+    // Amplitude(s) for diagram number 596
+    FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram597( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 597 OF 1240 ***
+    // Wavefunction(s) for diagram number 597
+    // (none)
+    // Amplitude(s) for diagram number 597
+    FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram598( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 598 OF 1240 ***
+    // Wavefunction(s) for diagram number 598
+    // (none)
+    // Amplitude(s) for diagram number 598
+    VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram599( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 599 OF 1240 ***
+    // Wavefunction(s) for diagram number 599
+    // (none)
+    // Amplitude(s) for diagram number 599
+    FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram600( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 600 OF 1240 ***
+    // Wavefunction(s) for diagram number 600
+    // (none)
+    // Amplitude(s) for diagram number 600
+    VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
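Splitting the former monolithic calculate_wavefunctions into one kernel per diagram keeps each compilation unit and its register footprint small for a 1240-diagram process; the orchestration then reduces to running the kernels in sequence for each helicity before the colour sum. The sketch below is schematic only: the driver loop, the kernel table and the names gpublocks/gputhreads are assumptions, not the repository's actual code.

// Hypothetical driver over the per-diagram kernels for one helicity (GPU build).
using diagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
static const diagramKernel diagrams[] = { /* diagram1, ..., */ diagram599, diagram600 /* , ..., diagram1240 */ };
for( auto kernel : diagrams )
  kernel<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... then contract jamps with the colour matrix to obtain |M|^2 for this helicity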
+  __global__ void
+  diagram601( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 601 OF 1240 ***
+    // Wavefunction(s) for diagram number 601
+    // (none)
+    // Amplitude(s) for diagram number 601
+    FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram602( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 602 OF 1240 ***
+    // Wavefunction(s) for diagram number 602
+    // (none)
+    // Amplitude(s) for diagram number 602
+    FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram603( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 603 OF 1240 ***
+    // Wavefunction(s) for diagram number 603
+    // (none)
+    // Amplitude(s) for diagram number 603
+    FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram604( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 604 OF 1240 ***
+    // Wavefunction(s) for diagram number 604
+    // (none)
+    // Amplitude(s) for diagram number 604
+    FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram605( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 605 OF 1240 ***
+    // Wavefunction(s) for diagram number 605
+    // (none)
+    // Amplitude(s) for diagram number 605
+    VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram606( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 606 OF 1240 ***
+    // Wavefunction(s) for diagram number 606
+    // (none)
+    // Amplitude(s) for diagram number 606
+    FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
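Once all per-diagram kernels have run, the accumulated numerators and denominators yield the single-diagram-enhancement weight that MadEvent uses to reweight the event for the chosen channel. A worked micro-example, with made-up numbers purely for illustration:

// Hypothetical numeric example of the multichannel weight after all diagrams ran.
// Suppose channelId == 601 and only three diagrams gave non-zero amplitudes:
//   |amp_600|^2 = 0.2, |amp_601|^2 = 0.5, |amp_602|^2 = 0.3
// denominators_sv = 0.2 + 0.5 + 0.3 = 1.0  (every diagram adds when channelId != 0)
// numerators_sv   = 0.5                    (only the diagram matching channelId adds)
const fptype multichannelWeight = numerators_sv / denominators_sv; // = 0.5 here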
//-------------------------------------------------------------------------- + + __global__ void + diagram607( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 607 OF 1240 *** + // Wavefunction(s) for diagram number 607 + // (none) + // Amplitude(s) for diagram number 607 + FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 608 OF 1240 *** + // Wavefunction(s) for diagram number 608 + // (none) + // Amplitude(s) for diagram number 608 + FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
+  __global__ void
+  diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 608 OF 1240 ***
+    // Wavefunction(s) for diagram number 608
+    // (none)
+    // Amplitude(s) for diagram number 608
+    FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram609( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 609 OF 1240 ***
+    // Wavefunction(s) for diagram number 609
+    // (none)
+    // Amplitude(s) for diagram number 609
+    FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
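Every amplitude is folded into the colour-ordered partial amplitudes with a coefficient of +1, -1, +i or -i, so a line such as `J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];` is plain complex arithmetic. A minimal stand-alone check of that arithmetic, with std::complex standing in for the plugin's cxtype:

#include <cassert>
#include <complex>
int main()
{
  using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
  const cxtype amp( 3., 4. );
  cxtype jamp( 0., 0. );
  jamp -= cxtype( 0., 1. ) * amp; // i*(3+4i) = -4+3i, so jamp becomes 4-3i
  assert( jamp == cxtype( 4., -3. ) );
  return 0;
}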
+  __global__ void
+  diagram610( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 610 OF 1240 ***
+    // Wavefunction(s) for diagram number 610
+    // (none)
+    // Amplitude(s) for diagram number 610
+    FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram611( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 611 OF 1240 ***
+    // Wavefunction(s) for diagram number 611
+    // (none)
+    // Amplitude(s) for diagram number 611
+    FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram612( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 612 OF 1240 ***
+    // Wavefunction(s) for diagram number 612
+    // (none)
+    // Amplitude(s) for diagram number 612
+    FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram613( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 613 OF 1240 ***
+    // Wavefunction(s) for diagram number 613
+    // (none)
+    // Amplitude(s) for diagram number 613
+    FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram614( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 614 OF 1240 ***
+    // Wavefunction(s) for diagram number 614
+    // (none)
+    // Amplitude(s) for diagram number 614
+    FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram615( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 615 OF 1240 ***
+    // Wavefunction(s) for diagram number 615
+    // (none)
+    // Amplitude(s) for diagram number 615
+    VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
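Diagrams 613, 614 and 615 differ from their neighbours in two ways: each accumulates three amplitudes, one per related internal wavefunction (w_fp[57], w_fp[81], w_fp[82], presumably the three colour structures of a four-gluon vertex computed earlier, as done explicitly for w_fp[107]/w_fp[95]/w_fp[105] in diagram622 below), and none carries the MGONGPU_SUPPORTS_MULTICHANNEL block, presumably because such multi-structure diagrams are not valid single-diagram-enhancement channels. For the diagrams that do carry the block, the two accumulators plausibly feed the usual SDE channel weight downstream:

w_{\mathrm{ch}} \;=\; \frac{|A_{\mathrm{ch}}|^2}{\sum_{d} |A_d|^2}

where the `if( channelId == NNN )` line adds this channel's squared amplitude to the numerators and the `if( channelId != 0 )` line adds every contributing diagram's squared amplitude to the denominators.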
+  __global__ void
+  diagram616( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 616 OF 1240 ***
+    // Wavefunction(s) for diagram number 616
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 616
+    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram617( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 617 OF 1240 ***
+    // Wavefunction(s) for diagram number 617
+    // (none)
+    // Amplitude(s) for diagram number 617
+    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
"diagram_boilerplate.h" + // *** DIAGRAM 619 OF 1240 *** + // Wavefunction(s) for diagram number 619 + // (none) + // Amplitude(s) for diagram number 619 + FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 620 OF 1240 *** + // Wavefunction(s) for diagram number 620 + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 620 + VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including 
+  __global__ void
+  diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 620 OF 1240 ***
+    // Wavefunction(s) for diagram number 620
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 620
+    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 621 OF 1240 ***
+    // Wavefunction(s) for diagram number 621
+    // (none)
+    // Amplitude(s) for diagram number 621
+    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram622( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 622 OF 1240 ***
+    // Wavefunction(s) for diagram number 622
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 622
+    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram623( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 623 OF 1240 ***
+    // Wavefunction(s) for diagram number 623
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 623
+    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram624( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 624 OF 1240 ***
+    // Wavefunction(s) for diagram number 624
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 624
+    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram625( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 625 OF 1240 ***
+    // Wavefunction(s) for diagram number 625
+    // (none)
+    // Amplitude(s) for diagram number 625
+    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
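The FFV1_1/FFV1_2 wavefunction calls in this block (diagrams 623, 624, 627, 632, 639) build off-shell fermion lines and take two extra parameters, cIPD[0] and cIPD[1], which are presumably the internal fermion's mass and width in this process. The standard propagator factor such helpers would implement is

\frac{i\,(\slashed{p} + m)}{p^2 - m^2 + i\,m\,\Gamma}, \qquad m = \mathrm{cIPD[0]}, \quad \Gamma = \mathrm{cIPD[1]},

whereas the VVV1P0_1/VVVV*P0_1 calls pass explicit zeros (0., 0.) for the mass and width of the internal massless gluon.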
+  __global__ void
+  diagram626( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 626 OF 1240 ***
+    // Wavefunction(s) for diagram number 626
+    // (none)
+    // Amplitude(s) for diagram number 626
+    FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram627( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 627 OF 1240 ***
+    // Wavefunction(s) for diagram number 627
+    FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 627
+    FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram628( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 628 OF 1240 ***
+    // Wavefunction(s) for diagram number 628
+    // (none)
+    // Amplitude(s) for diagram number 628
+    FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram629( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 629 OF 1240 ***
+    // Wavefunction(s) for diagram number 629
+    // (none)
+    // Amplitude(s) for diagram number 629
+    FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram630( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 630 OF 1240 ***
+    // Wavefunction(s) for diagram number 630
+    // (none)
+    // Amplitude(s) for diagram number 630
+    FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram631( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 631 OF 1240 ***
+    // Wavefunction(s) for diagram number 631
+    VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] );
+    // Amplitude(s) for diagram number 631
+    FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
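Splitting each diagram into its own __global__ kernel implies a driver that runs them in sequence over the same buffers. A hypothetical sketch of that call pattern (not the plugin's actual calling code; the grid sizes and buffer setup are assumed):

#ifdef MGONGPUCPP_GPUIMPL
// One back-to-back kernel launch per diagram, on the same stream and buffers
diagram607<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
diagram608<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... through diagram1240
#else
// In the C++ build the same functions are plain calls per SIMD event page
diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators );
diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators );
// ... through diagram1240
#endif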
+  __global__ void
+  diagram632( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 632 OF 1240 ***
+    // Wavefunction(s) for diagram number 632
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] );
+    // Amplitude(s) for diagram number 632
+    FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram633( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 633 OF 1240 ***
+    // Wavefunction(s) for diagram number 633
+    // (none)
+    // Amplitude(s) for diagram number 633
+    FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram634( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 634 OF 1240 ***
+    // Wavefunction(s) for diagram number 634
+    // (none)
+    // Amplitude(s) for diagram number 634
+    VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram635( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 635 OF 1240 ***
+    // Wavefunction(s) for diagram number 635
+    // (none)
+    // Amplitude(s) for diagram number 635
+    FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram636( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 636 OF 1240 ***
+    // Wavefunction(s) for diagram number 636
+    // (none)
+    // Amplitude(s) for diagram number 636
+    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram637( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 637 OF 1240 ***
+    // Wavefunction(s) for diagram number 637
+    // (none)
+    // Amplitude(s) for diagram number 637
+    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
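After the last diagram kernel has run, jamps holds the ncolor colour-ordered amplitudes for one helicity; the squared matrix element is then presumably obtained by the standard MadGraph colour sum (not part of this hunk). A sketch, with hypothetical cf/denom tables standing in for the generated colour matrix:

// Hypothetical colour reduction: |M|^2 = sum_{i,j} jamp_i^* cf_ij jamp_j / denom_i
fptype deltaMEs = 0;
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype ztemp( 0, 0 );
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp += cf[icol][jcol] * jamp[jcol]; // row of the colour matrix times the jamp vector
  deltaMEs += cxreal( ztemp * cxconj( jamp[icol] ) ) / denom[icol];
}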
J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram639( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 639 OF 1240 *** + // Wavefunction(s) for diagram number 639 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 639 + FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram640( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
+#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 640 OF 1240 *** + // Wavefunction(s) for diagram number 640 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); + // Amplitude(s) for diagram number 640 + FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram641( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 641 OF 1240 *** + // Wavefunction(s) for diagram number 641 + // (none) + // Amplitude(s) for diagram number 641 + FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram642( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 642 OF 1240 ***
+ // Wavefunction(s) for diagram number 642
+ // (none)
+ // Amplitude(s) for diagram number 642
+ FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram643( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 643 OF 1240 ***
+ // Wavefunction(s) for diagram number 643
+ FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+ // Amplitude(s) for diagram number 643
+ FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram644( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 644 OF 1240 ***
+ // Wavefunction(s) for diagram number 644
+ // (none)
+ // Amplitude(s) for diagram number 644
+ FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram645( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 645 OF 1240 ***
+ // Wavefunction(s) for diagram number 645
+ // (none)
+ // Amplitude(s) for diagram number 645
+ FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram646( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 646 OF 1240 ***
+ // Wavefunction(s) for diagram number 646
+ // (none)
+ // Amplitude(s) for diagram number 646
+ FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram647( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 647 OF 1240 ***
+ // Wavefunction(s) for diagram number 647
+ // (none)
+ // Amplitude(s) for diagram number 647
+ FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram648( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 648 OF 1240 ***
+ // Wavefunction(s) for diagram number 648
+ // (none)
+ // Amplitude(s) for diagram number 648
+ FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram649( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 649 OF 1240 ***
+ // Wavefunction(s) for diagram number 649
+ // (none)
+ // Amplitude(s) for diagram number 649
+ FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram650( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 650 OF 1240 ***
+ // Wavefunction(s) for diagram number 650
+ // (none)
+ // Amplitude(s) for diagram number 650
+ FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram651( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 651 OF 1240 ***
+ // Wavefunction(s) for diagram number 651
+ // (none)
+ // Amplitude(s) for diagram number 651
+ FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram652( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 652 OF 1240 ***
+ // Wavefunction(s) for diagram number 652
+ // (none)
+ // Amplitude(s) for diagram number 652
+ FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram653( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 653 OF 1240 ***
+ // Wavefunction(s) for diagram number 653
+ // (none)
+ // Amplitude(s) for diagram number 653
+ FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram654( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 654 OF 1240 ***
+ // Wavefunction(s) for diagram number 654
+ // (none)
+ // Amplitude(s) for diagram number 654
+ VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram655( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 655 OF 1240 ***
+ // Wavefunction(s) for diagram number 655
+ VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 655
+ VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram656( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 656 OF 1240 ***
+ // Wavefunction(s) for diagram number 656
+ VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
+ // Amplitude(s) for diagram number 656
+ VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram657( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 657 OF 1240 ***
+ // Wavefunction(s) for diagram number 657
+ // (none)
+ // Amplitude(s) for diagram number 657
+ VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram658( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 658 OF 1240 ***
+ // Wavefunction(s) for diagram number 658
+ // (none)
+ // Amplitude(s) for diagram number 658
+ FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram659( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 659 OF 1240 ***
+ // Wavefunction(s) for diagram number 659
+ // (none)
+ // Amplitude(s) for diagram number 659
+ FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram660( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 660 OF 1240 ***
+ // Wavefunction(s) for diagram number 660
+ // (none)
+ // Amplitude(s) for diagram number 660
+ FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram661( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 661 OF 1240 ***
+ // Wavefunction(s) for diagram number 661
+ // (none)
+ // Amplitude(s) for diagram number 661
+ FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram662( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 662 OF 1240 ***
+ // Wavefunction(s) for diagram number 662
+ // (none)
+ // Amplitude(s) for diagram number 662
+ FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram663( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 663 OF 1240 *** + // Wavefunction(s) for diagram number 663 + // (none) + // Amplitude(s) for diagram number 663 + FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram664( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 664 OF 1240 *** + // Wavefunction(s) for diagram number 664 + // (none) + // Amplitude(s) for diagram number 664 + FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram665( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 665 OF 1240 *** + // Wavefunction(s) for diagram 
number 665 + // (none) + // Amplitude(s) for diagram number 665 + FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram666( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 666 OF 1240 *** + // Wavefunction(s) for diagram number 666 + // (none) + // Amplitude(s) for diagram number 666 + FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram667( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 667 OF 1240 *** + // Wavefunction(s) for diagram number 667 + // (none) + // Amplitude(s) for diagram number 667 + FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram668( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 668 OF 1240 *** + // Wavefunction(s) for diagram number 668 + // (none) + // Amplitude(s) for diagram number 668 + FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
+  __global__ void
+  diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 669 OF 1240 ***
+    // Wavefunction(s) for diagram number 669
+    // (none)
+    // Amplitude(s) for diagram number 669
+    FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram670( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 670 OF 1240 ***
+    // Wavefunction(s) for diagram number 670
+    // (none)
+    // Amplitude(s) for diagram number 670
+    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram671( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 671 OF 1240 ***
+    // Wavefunction(s) for diagram number 671
+    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 671
+    VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram672( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 672 OF 1240 ***
+    // Wavefunction(s) for diagram number 672
+    // (none)
+    // Amplitude(s) for diagram number 672
+    VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram673( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 673 OF 1240 ***
+    // Wavefunction(s) for diagram number 673
+    // (none)
+    // Amplitude(s) for diagram number 673
+    VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
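The two guarded lines inside each #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block accumulate the single-diagram-enhancement (SDE) weight: only the selected channel's diagram feeds the numerator, while every channel-defining diagram feeds the denominator. A minimal standalone sketch of that accumulation follows; the amplitudes are invented and cxabs2 is reimplemented via std::norm, so this is an illustration of the pattern, not the generated code itself.

  #include <complex>
  #include <cstdio>
  #include <vector>
  using fptype = double;
  using cxtype = std::complex<fptype>;
  inline fptype cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2
  int main()
  {
    const unsigned int channelId = 2; // SDE channel chosen for this event (0 disables the enhancement)
    const std::vector<cxtype> amps = { { 1, 2 }, { 0, 3 }, { -1, 1 } }; // one amplitude per diagram (invented)
    fptype numerator = 0, denominator = 0;
    for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
    {
      const cxtype& amp = amps[idiag - 1];
      if( channelId == idiag ) numerator += cxabs2( amp ); // only the selected channel's diagram
      if( channelId != 0 ) denominator += cxabs2( amp );   // every diagram that defines a channel
    }
    std::printf( "multichannel weight = %f\n", numerator / denominator ); // in [0,1]
    return 0;
  }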
+  __global__ void
+  diagram674( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 674 OF 1240 ***
+    // Wavefunction(s) for diagram number 674
+    // (none)
+    // Amplitude(s) for diagram number 674
+    FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram675( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 675 OF 1240 ***
+    // Wavefunction(s) for diagram number 675
+    // (none)
+    // Amplitude(s) for diagram number 675
+    FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram676( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 676 OF 1240 ***
+    // Wavefunction(s) for diagram number 676
+    // (none)
+    // Amplitude(s) for diagram number 676
+    FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram677( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 677 OF 1240 ***
+    // Wavefunction(s) for diagram number 677
+    // (none)
+    // Amplitude(s) for diagram number 677
+    FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram678( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 678 OF 1240 ***
+    // Wavefunction(s) for diagram number 678
+    // (none)
+    // Amplitude(s) for diagram number 678
+    FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram679( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 679 OF 1240 ***
+    // Wavefunction(s) for diagram number 679
+    // (none)
+    // Amplitude(s) for diagram number 679
+    FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram680( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 680 OF 1240 ***
+    // Wavefunction(s) for diagram number 680
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 680
+    VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram681( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 681 OF 1240 ***
+    // Wavefunction(s) for diagram number 681
+    // (none)
+    // Amplitude(s) for diagram number 681
+    VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram682( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 682 OF 1240 ***
+    // Wavefunction(s) for diagram number 682
+    // (none)
+    // Amplitude(s) for diagram number 682
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram683( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 683 OF 1240 ***
+    // Wavefunction(s) for diagram number 683
+    // (none)
+    // Amplitude(s) for diagram number 683
+    VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
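Each amplitude is then scattered into a fixed subset of the color-ordered partial amplitudes (jamps), with signs and occasional factors of cxtype( 0, 1 ) (the imaginary unit) dictated by the color decomposition of the diagram. Below is a minimal standalone sketch of this accumulation pattern, with invented coefficients and a plain std::array standing in for the J_ACCESS kernel accessor.

  #include <array>
  #include <complex>
  #include <cstdio>
  using cxtype = std::complex<double>;
  int main()
  {
    constexpr int ncolor = 4;           // the real process uses O(100) color flows
    std::array<cxtype, ncolor> jamps{}; // color-ordered partial amplitudes, zero-initialized
    const cxtype amp( 0.5, -1.0 );      // one diagram's amplitude (invented value)
    const cxtype i1( 0, 1 );            // the cxtype( 0, 1 ) factor in the generated code
    // Each diagram adds +/- amp (or +/- i*amp) to a fixed subset of color flows:
    jamps[0] -= i1 * amp;
    jamps[1] += i1 * amp;
    jamps[2] += amp;
    jamps[3] -= amp;
    for( int icol = 0; icol < ncolor; icol++ )
      std::printf( "jamp[%d] = (%f, %f)\n", icol, jamps[icol].real(), jamps[icol].imag() );
    return 0;
  }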
+  __global__ void
+  diagram684( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 684 OF 1240 ***
+    // Wavefunction(s) for diagram number 684
+    // (none)
+    // Amplitude(s) for diagram number 684
+    VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram685( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 685 OF 1240 ***
+    // Wavefunction(s) for diagram number 685
+    // (none)
+    // Amplitude(s) for diagram number 685
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram686( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 686 OF 1240 ***
+    // Wavefunction(s) for diagram number 686
+    // (none)
+    // Amplitude(s) for diagram number 686
+    VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram687( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 687 OF 1240 ***
+    // Wavefunction(s) for diagram number 687
+    // (none)
+    // Amplitude(s) for diagram number 687
+    VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram688( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 688 OF 1240 ***
+    // Wavefunction(s) for diagram number 688
+    // (none)
+    // Amplitude(s) for diagram number 688
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram689( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 689 OF 1240 ***
+    // Wavefunction(s) for diagram number 689
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 689
+    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 690 OF 1240 ***
+    // Wavefunction(s) for diagram number 690
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 690
+    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
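Diagrams 689 and 690 first build three off-shell internal wavefunctions, one per Lorentz structure of the four-vector vertex (VVVV1P0_1, VVVV3P0_1, VVVV4P0_1), and then contract each of them with the same pair of external wavefunctions via VVV1_0. The following self-contained sketch illustrates that build-once, contract-three-times pattern; the functions here are dummy stand-ins for the HELAS kernels, not the actual physics code.

  #include <array>
  #include <complex>
  #include <cstdio>
  using cxtype = std::complex<double>;
  using wf6 = std::array<cxtype, 6>; // one 6-component HELAS-style wavefunction
  // Dummy stand-in: build an off-shell current for Lorentz structure n from three inputs.
  wf6 vvvvnP0_1( const wf6& a, const wf6& b, const wf6& c, int n )
  {
    wf6 out{};
    for( size_t i = 0; i < 6; i++ ) out[i] = ( a[i] + b[i] + c[i] ) * double( n );
    return out;
  }
  // Dummy stand-in: contract three wavefunctions into one amplitude.
  cxtype vvv1_0( const wf6& a, const wf6& b, const wf6& c )
  {
    cxtype amp = 0;
    for( size_t i = 0; i < 6; i++ ) amp += a[i] * b[i] * c[i];
    return amp;
  }
  int main()
  {
    std::array<wf6, 128> w{}; // shared wavefunction buffer, like w_fp in the generated code
    w[92].fill( { 1, 0 } ); w[1].fill( { 0, 1 } ); w[4].fill( { 1, 1 } );
    w[8].fill( { 2, 0 } ); w[5].fill( { 0, 2 } );
    // Diagram-689 pattern: build one internal wavefunction per Lorentz structure...
    w[98] = vvvvnP0_1( w[92], w[1], w[4], 1 );
    w[62] = vvvvnP0_1( w[92], w[1], w[4], 3 );
    w[101] = vvvvnP0_1( w[92], w[1], w[4], 4 );
    // ...then contract each with the same external legs, summing three amplitudes.
    const cxtype amp = vvv1_0( w[8], w[5], w[98] ) + vvv1_0( w[8], w[5], w[62] ) + vvv1_0( w[8], w[5], w[101] );
    std::printf( "amp = (%f, %f)\n", amp.real(), amp.imag() );
    return 0;
  }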
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 692 OF 1240 *** 
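Each generated kernel in this hunk repeats the same two comment lines and then includes "diagram_boilerplate.h". That header is not part of this hunk, so the following is only a plausible sketch of what it expands to, assuming a one-event-per-GPU-thread mapping; the ievt computation and the scalar accumulator bindings below are assumptions, not code from this patch:

    // Plausible expansion of diagram_boilerplate.h (sketch; names are assumed)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      const unsigned int channelId = channelIds[ievt];        // 1 to #diagrams, 0 to disable SDE
      fptype& numerators_sv = numerators[ievt];               // per-event SDE numerator
      fptype& denominators_sv = denominators[ievt];           // per-event SDE denominator
    #else
      // Uniform interface without multichannel support: the unused pointers must be nullptr
      assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
    #endif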
+  __global__ void
+  diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 692 OF 1240 ***
+    // Wavefunction(s) for diagram number 692
+    // (none)
+    // Amplitude(s) for diagram number 692
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram693( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 693 OF 1240 ***
+    // Wavefunction(s) for diagram number 693
+    // (none)
+    // Amplitude(s) for diagram number 693
+    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram694( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 694 OF 1240 ***
+    // Wavefunction(s) for diagram number 694
+    // (none)
+    // Amplitude(s) for diagram number 694
+    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram695( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 695 OF 1240 ***
+    // Wavefunction(s) for diagram number 695
+    // (none)
+    // Amplitude(s) for diagram number 695
+    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
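Diagrams 693 to 695 above show the multichannel (SDE) pattern that recurs through the rest of the hunk: the squared modulus of the amplitude is added to the per-event numerator only when this diagram is the selected channel, and to the denominator for any nonzero channelId; the numerator/denominator ratio later reweights the event towards its channel. The cxabs2 helper used here is the squared modulus; a minimal sketch consistent with that usage (the exact signature in the plugin may differ, e.g. it may take a SIMD vector type):

    // Sketch: squared modulus |z|^2 = Re(z)^2 + Im(z)^2, without the sqrt of abs()
    __host__ __device__ inline fptype
    cxabs2( const cxtype& z )
    {
      return cxreal( z ) * cxreal( z ) + cximag( z ) * cximag( z );
    }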
+  __global__ void
+  diagram696( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 696 OF 1240 ***
+    // Wavefunction(s) for diagram number 696
+    // (none)
+    // Amplitude(s) for diagram number 696
+    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram697( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 697 OF 1240 ***
+    // Wavefunction(s) for diagram number 697
+    // (none)
+    // Amplitude(s) for diagram number 697
+    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram698( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 698 OF 1240 ***
+    // Wavefunction(s) for diagram number 698
+    // (none)
+    // Amplitude(s) for diagram number 698
+    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram699( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 699 OF 1240 ***
+    // Wavefunction(s) for diagram number 699
+    // (none)
+    // Amplitude(s) for diagram number 699
+    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
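In diagrams 696 to 699 the amplitudes enter the color sums with an explicit factor cxtype( 0, 1 ), i.e. +i or -i, while elsewhere the coefficients are +1 or -1: each amplitude is folded into the color-ordered partial amplitudes ("jamps") with its color-flow coefficient. J_ACCESS::kernelAccessIcol must therefore return a writable complex reference to the slot of color index icol for the current event; below is a minimal sketch under the jamps[ncolor*2*nevt] layout named in the signature comments (the plane ordering and the cxtype_ref details are assumptions, not code from this patch):

    // Sketch of a kernelAccessIcol-style accessor (layout order is an assumption)
    static __device__ inline cxtype_ref
    kernelAccessIcol( fptype* jamps, const int icol )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      const int nevt = gridDim.x * blockDim.x;                // total events in the grid
      return cxtype_ref( jamps[( 2 * icol + 0 ) * nevt + ievt],   // real part plane
                         jamps[( 2 * icol + 1 ) * nevt + ievt] ); // imaginary part plane
    }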
+  __global__ void
+  diagram700( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 700 OF 1240 ***
+    // Wavefunction(s) for diagram number 700
+    // (none)
+    // Amplitude(s) for diagram number 700
+    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram701( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 701 OF 1240 ***
+    // Wavefunction(s) for diagram number 701
+    // (none)
+    // Amplitude(s) for diagram number 701
+    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram702( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 702 OF 1240 ***
+    // Wavefunction(s) for diagram number 702
+    // (none)
+    // Amplitude(s) for diagram number 702
+    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram703( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 703 OF 1240 ***
+    // Wavefunction(s) for diagram number 703
+    // (none)
+    // Amplitude(s) for diagram number 703
+    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram704( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 704 OF 1240 ***
+    // Wavefunction(s) for diagram number 704
+    // (none)
+    // Amplitude(s) for diagram number 704
+    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram705( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 705 OF 1240 ***
+    // Wavefunction(s) for diagram number 705
+    // (none)
+    // Amplitude(s) for diagram number 705
+    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram706( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 706 OF 1240 ***
+    // Wavefunction(s) for diagram number 706
+    // (none)
+    // Amplitude(s) for diagram number 706
+    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram707( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 707 OF 1240 ***
+    // Wavefunction(s) for diagram number 707
+    // (none)
+    // Amplitude(s) for diagram number 707
+    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram708( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 708 OF 1240 ***
+    // Wavefunction(s) for diagram number 708
+    // (none)
+    // Amplitude(s) for diagram number 708
+    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram709( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 709 OF 1240 ***
+    // Wavefunction(s) for diagram number 709
+    // (none)
+    // Amplitude(s) for diagram number 709
+    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram710( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 710 OF 1240 ***
+    // Wavefunction(s) for diagram number 710
+    // (none)
+    // Amplitude(s) for diagram number 710
+    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram711( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 711 OF 1240 ***
+    // Wavefunction(s) for diagram number 711
+    // (none)
+    // Amplitude(s) for diagram number 711
+    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram712( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 712 OF 1240 ***
+    // Wavefunction(s) for diagram number 712
+    // (none)
+    // Amplitude(s) for diagram number 712
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
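A pattern worth noting at this point in the hunk: only single-amplitude diagrams (693-701, 703-711, and 713 onwards) carry the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL numerator/denominator block, whereas diagrams with several amplitudes (691, 692, 702, 712) only update jamps and never contribute a single-channel numerator. Since all these kernels read and write the same wavefunction and jamp buffers, they can presumably be launched back to back on one stream; a hypothetical host-side sketch follows (the launch configuration, helper macros and buffer names are illustrative, not from this patch):

    // Hypothetical sequential launches over shared device buffers
    diagram712<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram713<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    checkGpu( gpuPeekAtLastError() ); // fail fast if a launch was invalid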
+  __global__ void
+  diagram713( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 713 OF 1240 ***
+    // Wavefunction(s) for diagram number 713
+    // (none)
+    // Amplitude(s) for diagram number 713
+    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram714( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 714 OF 1240 ***
+    // Wavefunction(s) for diagram number 714
+    // (none)
+    // Amplitude(s) for diagram number 714
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram715( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 715 OF 1240 ***
+    // Wavefunction(s) for diagram number 715
+    // (none)
+    // Amplitude(s) for diagram number 715
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram716( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 716 OF 1240 ***
+    // Wavefunction(s) for diagram number 716
+    // (none)
+    // Amplitude(s) for diagram number 716
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram717( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 717 OF 1240 ***
+    // Wavefunction(s) for diagram number 717
+    // (none)
+    // Amplitude(s) for diagram number 717
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram718( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 718 OF 1240 ***
+    // Wavefunction(s) for diagram number 718
+    // (none)
+    // Amplitude(s) for diagram number 718
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram719( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 719 OF 1240 ***
+    // Wavefunction(s) for diagram number 719
+    // (none)
+    // Amplitude(s) for diagram number 719
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram720( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 720 OF 1240 ***
+    // Wavefunction(s) for diagram number 720
+    // (none)
+    // Amplitude(s) for diagram number 720
+    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram721( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 721 OF 1240 ***
+    // Wavefunction(s) for diagram number 721
+    // (none)
+    // Amplitude(s) for diagram number 721
+    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram722( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 722 OF 1240 ***
+    // Wavefunction(s) for diagram number 722
+    // (none)
+    // Amplitude(s) for diagram number 722
+    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 )
+= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram723( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 723 OF 1240 *** + // Wavefunction(s) for diagram number 723 + // (none) + // Amplitude(s) for diagram number 723 + VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram724( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 724 OF 1240 *** + // Wavefunction(s) for diagram number 724 + // (none) + // Amplitude(s) for diagram number 724 + FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram725( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 725 OF 1240 *** + // Wavefunction(s) for diagram number 725 + // (none) + // Amplitude(s) for diagram number 725 + FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram726( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr 
as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 726 OF 1240 *** + // Wavefunction(s) for diagram number 726 + // (none) + // Amplitude(s) for diagram number 726 + FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram727( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 727 OF 1240 *** + // Wavefunction(s) for diagram number 727 + // (none) + // Amplitude(s) for diagram number 727 + FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram728( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 728 OF 1240 *** + // Wavefunction(s) for diagram number 728 + // (none) + // Amplitude(s) for diagram number 728 + VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram729( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 729 OF 1240 *** + // Wavefunction(s) for diagram number 729 + // (none) + // Amplitude(s) for diagram number 729 + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram730( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 730 OF 1240 *** + // Wavefunction(s) for diagram number 730 + // (none) + // Amplitude(s) for diagram number 730 + FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram731( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 731 OF 1240 *** + // Wavefunction(s) for diagram number 
731 + // (none) + // Amplitude(s) for diagram number 731 + FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram732( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 732 OF 1240 *** + // Wavefunction(s) for diagram number 732 + // (none) + // Amplitude(s) for diagram number 732 + FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram733( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 733 OF 1240 *** + // Wavefunction(s) for diagram number 733 + // (none) + // Amplitude(s) for diagram number 733 + FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram734( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 734 OF 1240 *** + // Wavefunction(s) for diagram number 734 + // (none) + // Amplitude(s) for diagram number 734 + FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram735( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all 
events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 735 OF 1240 *** + // Wavefunction(s) for diagram number 735 + // (none) + // Amplitude(s) for diagram number 735 + FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram736( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 736 OF 1240 *** + // Wavefunction(s) for diagram number 736 + // (none) + // Amplitude(s) for diagram number 736 + FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram737( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 737 OF 1240 *** + // Wavefunction(s) for diagram number 737 + // (none) + // Amplitude(s) for diagram number 737 + FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram738( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 738 OF 1240 *** + // Wavefunction(s) for diagram number 738 + // (none) + // Amplitude(s) for diagram number 738 + VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
100 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram739( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 739 OF 1240 *** + // Wavefunction(s) for diagram number 739 + FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] ); + // Amplitude(s) for diagram number 739 + FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram740( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all 
three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 740 OF 1240 *** + // Wavefunction(s) for diagram number 740 + // (none) + // Amplitude(s) for diagram number 740 + FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram741( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 741 OF 1240 *** + // Wavefunction(s) for diagram number 741 + FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 741 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram742( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 742 OF 1240 *** + // Wavefunction(s) for diagram number 742 + // (none) + // Amplitude(s) for diagram number 742 + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram743( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 743 OF 1240 *** + // Wavefunction(s) for diagram number 743 + // (none) + // Amplitude(s) for diagram number 743 + FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram744( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 744 OF 1240 *** + // Wavefunction(s) for diagram number 744 + // (none) + // Amplitude(s) for diagram number 744 + FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram745( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 745 OF 1240 *** + // Wavefunction(s) for diagram number 745 + // (none) + // Amplitude(s) for diagram number 745 + FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram746( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 746 OF 1240 *** + // Wavefunction(s) for diagram number 746 + // (none) + // Amplitude(s) for diagram number 746 + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram747( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 747 OF 1240 *** + // Wavefunction(s) for diagram number 747 + VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] ); + // Amplitude(s) for diagram number 747 + FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram748( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 748 OF 1240 *** + // Wavefunction(s) for diagram number 748 + // (none) + // Amplitude(s) for diagram number 748 + FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram749( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 749 OF 1240 *** + // Wavefunction(s) for diagram number 749 + // (none) + // Amplitude(s) for 
diagram number 749 + FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram750( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 750 OF 1240 *** + // Wavefunction(s) for diagram number 750 + FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 750 + FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram751( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 751 OF 1240 *** + // Wavefunction(s) for diagram number 751 + // (none) + // Amplitude(s) for diagram number 751 + FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram752( fptype* wfs, // input/output 
+  __global__ void
+  diagram752( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 752 OF 1240 ***
+    // Wavefunction(s) for diagram number 752
+    // (none)
+    // Amplitude(s) for diagram number 752
+    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram753( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 753 OF 1240 ***
+    // Wavefunction(s) for diagram number 753
+    // (none)
+    // Amplitude(s) for diagram number 753
+    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram754( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 754 OF 1240 ***
+    // Wavefunction(s) for diagram number 754
+    // (none)
+    // Amplitude(s) for diagram number 754
+    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram755( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 755 OF 1240 ***
+    // Wavefunction(s) for diagram number 755
+    // (none)
+    // Amplitude(s) for diagram number 755
+    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram756( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 756 OF 1240 ***
+    // Wavefunction(s) for diagram number 756
+    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 756
+    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram757( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 757 OF 1240 ***
+    // Wavefunction(s) for diagram number 757
+    // (none)
+    // Amplitude(s) for diagram number 757
+    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
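The two lines inside each #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block implement the single-diagram-enhancement bookkeeping: the numerator accumulates |amp|^2 only for the diagram selected by channelId, while the denominator accumulates |amp|^2 over every diagram (channelId == 0 disables the whole SDE machinery). A standalone C++ illustration of that accumulation, with plain doubles in place of fptype_sv and std::norm standing in for cxabs2:

    #include <complex>
    #include <cstdio>

    using cxtype = std::complex<double>;
    inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2, as cxabs2 above

    int main()
    {
      const unsigned int channelId = 2; // the diagram selected by the multichannel strategy
      const cxtype amp[3] = { { 1, 0 }, { 0, 2 }, { 1, 1 } }; // mock per-diagram amplitudes
      double numerators = 0, denominators = 0;
      for( unsigned int idiag = 1; idiag <= 3; idiag++ )
      {
        if( channelId == idiag ) numerators += cxabs2( amp[idiag - 1] ); // selected channel only
        if( channelId != 0 ) denominators += cxabs2( amp[idiag - 1] );   // all diagrams (0 disables SDE)
      }
      printf( "single-diagram enhancement weight = %f\n", numerators / denominators );
      return 0;
    }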
+  __global__ void
+  diagram758( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 758 OF 1240 ***
+    // Wavefunction(s) for diagram number 758
+    // (none)
+    // Amplitude(s) for diagram number 758
+    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram759( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 759 OF 1240 ***
+    // Wavefunction(s) for diagram number 759
+    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 759
+    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram760( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 760 OF 1240 ***
+    // Wavefunction(s) for diagram number 760
+    // (none)
+    // Amplitude(s) for diagram number 760
+    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram761( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 761 OF 1240 ***
+    // Wavefunction(s) for diagram number 761
+    // (none)
+    // Amplitude(s) for diagram number 761
+    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram762( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 762 OF 1240 ***
+    // Wavefunction(s) for diagram number 762
+    // (none)
+    // Amplitude(s) for diagram number 762
+    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram763( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 763 OF 1240 ***
+    // Wavefunction(s) for diagram number 763
+    // (none)
+    // Amplitude(s) for diagram number 763
+    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram764( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 764 OF 1240 ***
+    // Wavefunction(s) for diagram number 764
+    // (none)
+    // Amplitude(s) for diagram number 764
+    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
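After the optional multichannel block, every kernel scatters its invariant amplitude into a handful of color-ordered amplitudes via J_ACCESS::kernelAccessIcol( jamps, icol ), always with coefficients +1, -1, +i or -i. A minimal standalone sketch of that bookkeeping, using plain array indexing in place of the accessor and the +-i pattern of diagram 763 above:

    #include <complex>
    #include <cstdio>

    using cxtype = std::complex<double>;

    int main()
    {
      cxtype jamp[48] = {};             // large enough for the icol values (up to 47) used in this hunk
      const cxtype amp( 0.3, -0.1 );    // mock invariant amplitude for one diagram
      jamp[24] += cxtype( 0, 1 ) * amp; // the two color flows fed by diagram 763
      jamp[26] -= cxtype( 0, 1 ) * amp;
      printf( "jamp[24] = ( %f, %f )\n", jamp[24].real(), jamp[24].imag() );
      return 0;
    }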
+  __global__ void
+  diagram765( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 765 OF 1240 ***
+    // Wavefunction(s) for diagram number 765
+    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 765
+    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram766( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 766 OF 1240 ***
+    // Wavefunction(s) for diagram number 766
+    // (none)
+    // Amplitude(s) for diagram number 766
+    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram767( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 767 OF 1240 ***
+    // Wavefunction(s) for diagram number 767
+    // (none)
+    // Amplitude(s) for diagram number 767
+    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram768( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 768 OF 1240 ***
+    // Wavefunction(s) for diagram number 768
+    // (none)
+    // Amplitude(s) for diagram number 768
+    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram769( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 769 OF 1240 ***
+    // Wavefunction(s) for diagram number 769
+    // (none)
+    // Amplitude(s) for diagram number 769
+    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram770( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 770 OF 1240 ***
+    // Wavefunction(s) for diagram number 770
+    // (none)
+    // Amplitude(s) for diagram number 770
+    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram771( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 771 OF 1240 ***
+    // Wavefunction(s) for diagram number 771
+    // (none)
+    // Amplitude(s) for diagram number 771
+    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
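Diagram 772 below is the first in this hunk to use the four-gluon vertex: VVVV1P0_1, VVVV3P0_1 and VVVV4P0_1 build one internal wavefunction per color structure of the vertex, and each of the three resulting FFV1_0 amplitudes is scattered into a different (overlapping) subset of jamps. Note also that, in the code as given, these three-amplitude diagrams carry no per-channel numerator/denominator block. The following standalone sketch reproduces just the first two jamp updates of each of the three pieces (coefficients copied from diagram 772; the function name is illustrative):

    #include <complex>

    using cxtype = std::complex<double>;

    // fold the three color structures of the four-gluon vertex into the color sums
    void add_vvvv_pieces( cxtype* jamp, const cxtype amp1, const cxtype amp2, const cxtype amp3 )
    {
      const cxtype I( 0, 1 );
      jamp[24] -= I * amp1; jamp[26] += I * amp1; // from the VVVV1 wavefunction (subset shown)
      jamp[28] += I * amp2; jamp[29] -= I * amp2; // from the VVVV3 wavefunction (subset shown)
      jamp[24] += I * amp3; jamp[26] -= I * amp3; // from the VVVV4 wavefunction (subset shown)
    }

    int main()
    {
      cxtype jamp[48] = {};
      add_vvvv_pieces( jamp, { 1, 0 }, { 0, 1 }, { 1, 1 } ); // mock partial amplitudes
      return 0;
    }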
+  __global__ void
+  diagram772( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 772 OF 1240 ***
+    // Wavefunction(s) for diagram number 772
+    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
+    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 772
+    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram773( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 773 OF 1240 ***
+    // Wavefunction(s) for diagram number 773
+    // (none)
+    // Amplitude(s) for diagram number 773
+    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram774( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 774 OF 1240 ***
+    // Wavefunction(s) for diagram number 774
+    // (none)
+    // Amplitude(s) for diagram number 774
+    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram775( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 775 OF 1240 ***
+    // Wavefunction(s) for diagram number 775
+    // (none)
+    // Amplitude(s) for diagram number 775
+    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram776( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 776 OF 1240 ***
+    // Wavefunction(s) for diagram number 776
+    // (none)
+    // Amplitude(s) for diagram number 776
+    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram777( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 777 OF 1240 ***
+    // Wavefunction(s) for diagram number 777
+    // (none)
+    // Amplitude(s) for diagram number 777
+    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram778( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 778 OF 1240 ***
+    // Wavefunction(s) for diagram number 778
+    // (none)
+    // Amplitude(s) for diagram number 778
+    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
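The #ifdef MGONGPUCPP_GPUIMPL in every signature above reflects two different coupling layouts: on GPU each kernel receives one flat couplings buffer for all events and each thread must locate its own event's dependent couplings, while in the C++ build the caller passes a ready-made array of COUPs pointers for the current event page. The boilerplate header is presumably where the two are reconciled; the standalone sketch below only illustrates the indexing a GPU thread might do into an event-major [ievt][idcoup][re/im] buffer, which is an assumed layout, not one confirmed by this diff:

    #include <cstdio>

    int main()
    {
      const int nevt = 4, ndcoup = 2;      // mock event and dependent-coupling counts
      double couplings[nevt * ndcoup * 2]; // flat [ievt][idcoup][re/im] buffer (assumed layout)
      for( int i = 0; i < nevt * ndcoup * 2; i++ ) couplings[i] = 0.1 * i;
      const int ievt = 3, idcoup = 1;
      const double* coup = &couplings[( ievt * ndcoup + idcoup ) * 2]; // what one thread might compute
      printf( "event %d, coupling %d: ( %f, %f )\n", ievt, idcoup, coup[0], coup[1] );
      return 0;
    }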
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram780( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 780 OF 1240 *** + // Wavefunction(s) for diagram number 780 + // (none) + // Amplitude(s) for diagram number 780 + FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram781( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 781 OF 1240 *** + // Wavefunction(s) for diagram number 781 + // (none) + // Amplitude(s) for diagram number 781 + FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram782( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 782 OF 1240 *** + // Wavefunction(s) for diagram number 782 + // (none) + // Amplitude(s) for diagram number 782 + VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram783( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 783 OF 1240 *** + // Wavefunction(s) for diagram number 783 + // (none) + // Amplitude(s) for diagram number 783 + FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + } + + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram784( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 784 OF 1240 ***
+    // Wavefunction(s) for diagram number 784
+    // (none)
+    // Amplitude(s) for diagram number 784
+    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram785( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 785 OF 1240 ***
+    // Wavefunction(s) for diagram number 785
+    // (none)
+    // Amplitude(s) for diagram number 785
+    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram786( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 786 OF 1240 ***
+    // Wavefunction(s) for diagram number 786
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 786
+    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram787( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 787 OF 1240 ***
+    // Wavefunction(s) for diagram number 787
+    // (none)
+    // Amplitude(s) for diagram number 787
+    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram788( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 788 OF 1240 ***
+    // Wavefunction(s) for diagram number 788
+    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
+    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
+    // Amplitude(s) for diagram number 788
+    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram789( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 789 OF 1240 ***
+    // Wavefunction(s) for diagram number 789
+    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 789
+    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
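For orientation (an assumption, not shown in this patch): splitting the computation into one small diagramNNN kernel per Feynman diagram suggests that the caller chains them over all 1240 diagrams for each helicity, along these lines:

  // Sketch of the presumed call sequence (names and launch syntax are assumptions)
  for( int ihel = 0; ihel < ncomb; ihel++ ) // loop over helicity combinations
  {
    // ... compute the external wavefunctions into wfs for this helicity ...
    diagram782<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram783<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... one call per diagram, up to diagram1240 ...
  }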
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram790( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 790 OF 1240 ***
+    // Wavefunction(s) for diagram number 790
+    // (none)
+    // Amplitude(s) for diagram number 790
+    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram791( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 791 OF 1240 ***
+    // Wavefunction(s) for diagram number 791
+    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 791
+    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram792( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 792 OF 1240 ***
+    // Wavefunction(s) for diagram number 792
+    // (none)
+    // Amplitude(s) for diagram number 792
+    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram793( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 793 OF 1240 ***
+    // Wavefunction(s) for diagram number 793
+    // (none)
+    // Amplitude(s) for diagram number 793
+    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram794( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 794 OF 1240 ***
+    // Wavefunction(s) for diagram number 794
+    // (none)
+    // Amplitude(s) for diagram number 794
+    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram795( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 795 OF 1240 ***
+    // Wavefunction(s) for diagram number 795
+    // (none)
+    // Amplitude(s) for diagram number 795
+    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram796( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 796 OF 1240 ***
+    // Wavefunction(s) for diagram number 796
+    // (none)
+    // Amplitude(s) for diagram number 796
+    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
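In the MGONGPU_SUPPORTS_MULTICHANNEL blocks above, cxabs2 presumably returns |amp|^2; every contributing diagram adds its |amp|^2 to denominators_sv, while only the diagram matching the event's channelId adds to numerators_sv. A sketch of how the accumulated values are presumably used downstream (the function name is an assumption):

  // Single-diagram-enhancement weight (sketch): reweight the summed matrix
  // element by this channel's share of the sum over per-diagram |amp|^2 values.
  inline fptype sdeWeight( const fptype numerator, const fptype denominator, const unsigned int channelId )
  {
    return ( channelId != 0 ? numerator / denominator : 1 ); // channelId == 0 disables SDE
  }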
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram797( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 797 OF 1240 ***
+    // Wavefunction(s) for diagram number 797
+    // (none)
+    // Amplitude(s) for diagram number 797
+    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram798( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 798 OF 1240 ***
+    // Wavefunction(s) for diagram number 798
+    // (none)
+    // Amplitude(s) for diagram number 798
+    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram799( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 799 OF 1240 ***
+    // Wavefunction(s) for diagram number 799
+    // (none)
+    // Amplitude(s) for diagram number 799
+    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram800( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 800 OF 1240 ***
+    // Wavefunction(s) for diagram number 800
+    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 800
+    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram801( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 801 OF 1240 ***
+    // Wavefunction(s) for diagram number 801
+    // (none)
+    // Amplitude(s) for diagram number 801
+    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram802( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 802 OF 1240 ***
+    // Wavefunction(s) for diagram number 802
+    // (none)
+    // Amplitude(s) for diagram number 802
+    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram803( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 803 OF 1240 ***
+    // Wavefunction(s) for diagram number 803
+    // (none)
+    // Amplitude(s) for diagram number 803
+    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram804( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 804 OF 1240 ***
+    // Wavefunction(s) for diagram number 804
+    // (none)
+    // Amplitude(s) for diagram number 804
+    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram805( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 805 OF 1240 ***
+    // Wavefunction(s) for diagram number 805
+    // (none)
+    // Amplitude(s) for diagram number 805
+    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram806( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 806 OF 1240 ***
+    // Wavefunction(s) for diagram number 806
+    // (none)
+    // Amplitude(s) for diagram number 806
+    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram807( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 807 OF 1240 ***
+    // Wavefunction(s) for diagram number 807
+    // (none)
+    // Amplitude(s) for diagram number 807
+    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram808( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 808 OF 1240 ***
+    // Wavefunction(s) for diagram number 808
+    // (none)
+    // Amplitude(s) for diagram number 808
+    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram809( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 809 OF 1240 ***
+    // Wavefunction(s) for diagram number 809
+    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 809
+    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram810( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 810 OF 1240 ***
+    // Wavefunction(s) for diagram number 810
+    // (none)
+    // Amplitude(s) for diagram number 810
+    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram811( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 811 OF 1240 ***
+    // Wavefunction(s) for diagram number 811
+    // (none)
+    // Amplitude(s) for diagram number 811
+    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram812( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 812 OF 1240 ***
+    // Wavefunction(s) for diagram number 812
+    // (none)
+    // Amplitude(s) for diagram number 812
+    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+  }
+
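J_ACCESS::kernelAccessIcol( jamps, icol ) presumably returns a writable complex reference to color-flow amplitude icol for the current event inside the jamps[ncolor*2*nevtORneppV] buffer. A minimal sketch assuming the simplest possible layout (the real accessor likely uses the plugin's AOSOA layout instead, and cxtype_ref is assumed to bind separate real/imaginary storage):

  // Sketch only: one thread per event, jamps laid out as [icol][real|imag][ievt]
  __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = gridDim.x * blockDim.x;                // total number of events in this grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // event handled by this thread
    return cxtype_ref( jamps[( icol * 2 + 0 ) * nevt + ievt],   // real part
                       jamps[( icol * 2 + 1 ) * nevt + ievt] ); // imaginary part
  }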
//-------------------------------------------------------------------------- + + __global__ void + diagram813( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 813 OF 1240 *** + // Wavefunction(s) for diagram number 813 + // (none) + // Amplitude(s) for diagram number 813 + FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram814( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 814 OF 1240 *** + // Wavefunction(s) for diagram number 814 + // (none) + // Amplitude(s) for diagram number 814 + FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram815( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 815 OF 1240 *** + // Wavefunction(s) for diagram number 815 + // (none) + // Amplitude(s) for diagram number 815 + FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram816( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 816 OF 1240 *** + // Wavefunction(s) for diagram number 816 + // (none) + // Amplitude(s) for diagram number 816 + FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram817( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 817 OF 1240 *** + // Wavefunction(s) for diagram number 817 + // (none) + // Amplitude(s) for diagram number 817 + FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram818( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 818 OF 1240 *** + // Wavefunction(s) for diagram number 818 + // (none) + // Amplitude(s) for diagram number 818 + VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram819( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
+  __global__ void
+  diagram819( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 819 OF 1240 ***
+    // Wavefunction(s) for diagram number 819
+    // (none)
+    // Amplitude(s) for diagram number 819
+    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram820( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 820 OF 1240 ***
+    // Wavefunction(s) for diagram number 820
+    // (none)
+    // Amplitude(s) for diagram number 820
+    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram821( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 821 OF 1240 ***
+    // Wavefunction(s) for diagram number 821
+    // (none)
+    // Amplitude(s) for diagram number 821
+    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram822( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 822 OF 1240 ***
+    // Wavefunction(s) for diagram number 822
+    // (none)
+    // Amplitude(s) for diagram number 822
+    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram823( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 823 OF 1240 ***
+    // Wavefunction(s) for diagram number 823
+    // (none)
+    // Amplitude(s) for diagram number 823
+    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
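The two guarded lines after each single-amplitude diagram (see diagram823 above) implement the single-diagram-enhancement weights: the squared amplitude of the selected channel accumulates into numerators_sv, while every channel contributes to denominators_sv, so the per-event multichannel weight is numerators/denominators. cxabs2 is the square modulus without the square root of abs(); a scalar sketch, assuming a cxtype with cxreal/cximag accessors as elsewhere in the plugin (the generated code uses the SIMD/GPU cxtype_sv variant), would be:

    // Scalar sketch of cxabs2 (assumed shape, not the plugin's actual definition)
    inline fptype cxabs2( const cxtype& c )
    {
      return cxreal( c ) * cxreal( c ) + cximag( c ) * cximag( c ); // |c|^2, no sqrt
    }

Note that diagrams grouped from several amplitudes (e.g. diagram822 above) update only jamps and skip the numerator/denominator bookkeeping.
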
"diagram_boilerplate.h" + // *** DIAGRAM 824 OF 1240 *** + // Wavefunction(s) for diagram number 824 + // (none) + // Amplitude(s) for diagram number 824 + FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram825( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 825 OF 1240 *** + // Wavefunction(s) for diagram number 825 + // (none) + // Amplitude(s) for diagram number 825 + VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram826( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also 
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 826 OF 1240 *** + // Wavefunction(s) for diagram number 826 + // (none) + // Amplitude(s) for diagram number 826 + FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram827( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 827 OF 1240 *** + // Wavefunction(s) for diagram number 827 + // (none) + // Amplitude(s) for diagram number 827 + VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram828( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: 
multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 828 OF 1240 *** + // Wavefunction(s) for diagram number 828 + // (none) + // Amplitude(s) for diagram number 828 + FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram829( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 829 OF 1240 *** + // Wavefunction(s) for diagram number 829 + // (none) + // Amplitude(s) for diagram number 829 + FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 
) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram830( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 830 OF 1240 *** + // Wavefunction(s) for diagram number 830 + // (none) + // Amplitude(s) for diagram number 830 + FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram831( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 831 OF 1240 *** + // Wavefunction(s) for diagram number 831 + // (none) + // Amplitude(s) for diagram number 831 + FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram832( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 832 OF 1240 *** + // Wavefunction(s) for diagram number 832 + // (none) + // Amplitude(s) for diagram number 832 + VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram833( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 833 OF 1240 *** + // Wavefunction(s) for diagram number 833 + // (none) + // Amplitude(s) for diagram number 833 + FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( 
channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram834( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 834 OF 1240 *** + // Wavefunction(s) for diagram number 834 + // (none) + // Amplitude(s) for diagram number 834 + VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram835( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 835 OF 1240 *** + // Wavefunction(s) for diagram number 835 + // 
(none) + // Amplitude(s) for diagram number 835 + FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram836( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 836 OF 1240 *** + // Wavefunction(s) for diagram number 836 + // (none) + // Amplitude(s) for diagram number 836 + FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram837( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 837 OF 1240 *** + // Wavefunction(s) for diagram number 837 + // (none) + // Amplitude(s) for diagram number 837 + FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram838( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 838 OF 1240 *** + // Wavefunction(s) for diagram number 838 + // (none) + // Amplitude(s) for diagram number 838 + FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram839( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 839 OF 1240 *** + // Wavefunction(s) for diagram number 839 + VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] ); + // Amplitude(s) for diagram number 839 + VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram840( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 840 OF 1240 *** + // Wavefunction(s) for diagram number 840 + // (none) + // Amplitude(s) for diagram number 840 + VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram841( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 841 OF 1240 *** + // Wavefunction(s) for diagram number 841 + // (none) + // Amplitude(s) for diagram number 841 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + 
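Each amplitude is scattered into a handful of color-ordered jamps with coefficients of ±1 or ±i (the cxtype( 0, 1 ) factors); the VVVV1_0/VVVV3_0/VVVV4_0 split in diagram841 above covers the three color structures of the four-gluon vertex. J_ACCESS::kernelAccessIcol hides the memory layout; a toy AOS version of the contract (the real buffers are SOA, roughly [ncolor][2][nevt], with per-event strides) might look like:

    // Toy AOS sketch of the kernelAccessIcol contract (assumed, not the plugin's SOA accessor)
    static inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
    {
      return reinterpret_cast<cxtype*>( jamps )[icol]; // reference to color amplitude icol
    }
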
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram842( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 842 OF 1240 ***
+    // Wavefunction(s) for diagram number 842
+    VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] );
+    // Amplitude(s) for diagram number 842
+    VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram843( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 843 OF 1240 ***
+    // Wavefunction(s) for diagram number 843
+    // (none)
+    // Amplitude(s) for diagram number 843
+    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram844( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 844 OF 1240 ***
+    // Wavefunction(s) for diagram number 844
+    // (none)
+    // Amplitude(s) for diagram number 844
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram845( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 845 OF 1240 ***
+    // Wavefunction(s) for diagram number 845
+    // (none)
+    // Amplitude(s) for diagram number 845
+    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram846( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 846 OF 1240 ***
+    // Wavefunction(s) for diagram number 846
+    // (none)
+    // Amplitude(s) for diagram number 846
+    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram847( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 847 OF 1240 ***
+    // Wavefunction(s) for diagram number 847
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 847
+    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram848( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 848 OF 1240 *** + // Wavefunction(s) for diagram number 848 + VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 848 + VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] 
); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram849( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 849 OF 1240 *** + // Wavefunction(s) for diagram number 849 + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); + // Amplitude(s) for diagram number 849 + VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram850( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 850 OF 1240 *** + // Wavefunction(s) for diagram number 850 + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); + // Amplitude(s) for diagram number 850 + VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram851( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 851 OF 1240 *** + 
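+    // Note: diagram 851 below evaluates the three Lorentz structures of the
+    // four-gluon vertex (VVVV1_0, VVVV3_0 and VVVV4_0, all taking the same
+    // wavefunctions and the gggg coupling COUPs[2]) as three separate
+    // amplitudes, each accumulated into a different subset of the color
+    // flows in jamps with +/- signs.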
// Wavefunction(s) for diagram number 851 + // (none) + // Amplitude(s) for diagram number 851 + VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram852( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,    // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 852 OF 1240 ***
+    // Wavefunction(s) for diagram number 852
+    // (none)
+    // Amplitude(s) for diagram number 852
+    VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram853( fptype* wfs,             // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,           // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,    // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,      // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )   // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 853 OF 1240 ***
+    // Wavefunction(s) for diagram number 853
+    // (none)
+    // Amplitude(s) for diagram number 853
+    VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram854( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 854 OF 1240 *** + // Wavefunction(s) for diagram number 854 + // (none) + // Amplitude(s) for diagram number 854 + VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram855( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 855 OF 1240 *** + // Wavefunction(s) for diagram number 855 + // (none) + // Amplitude(s) for diagram number 855 + VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram856( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 856 OF 1240 *** + // Wavefunction(s) for diagram number 856 + // (none) + // Amplitude(s) for diagram number 856 + FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram857( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // 
input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 857 OF 1240 *** + // Wavefunction(s) for diagram number 857 + // (none) + // Amplitude(s) for diagram number 857 + FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram858( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 858 OF 1240 *** + // Wavefunction(s) for diagram number 858 + // (none) + // Amplitude(s) for diagram number 858 + FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram859( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 859 OF 1240 *** + // Wavefunction(s) for diagram number 859 + // (none) + // Amplitude(s) for diagram number 859 + FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram860( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 860 OF 1240 *** + // Wavefunction(s) for diagram number 860 + // (none) + // Amplitude(s) for diagram number 860 + VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram861( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 861 OF 1240 *** + // Wavefunction(s) for diagram number 861 + // (none) + // Amplitude(s) for diagram number 861 + FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram862( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 862 OF 1240 *** + // Wavefunction(s) for diagram number 862 + // (none) + // Amplitude(s) for diagram number 862 + FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram863( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 863 OF 1240 *** + // Wavefunction(s) for diagram number 863 + // (none) + // Amplitude(s) for diagram number 863 + FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram864( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 864 OF 1240 *** + // Wavefunction(s) for diagram number 864 + // (none) + // Amplitude(s) for diagram number 864 + FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram865( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 865 OF 1240 *** + // Wavefunction(s) for diagram number 865 + // (none) + // Amplitude(s) for diagram number 865 + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram866( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 866 OF 1240 *** + // Wavefunction(s) for diagram number 866 + // (none) + // Amplitude(s) for diagram 
number 866 + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram867( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 867 OF 1240 *** + // Wavefunction(s) for diagram number 867 + // (none) + // Amplitude(s) for diagram number 867 + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram868( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 868 OF 1240 *** + // Wavefunction(s) for diagram number 868 + // (none) + // Amplitude(s) for diagram number 868 + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + 
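+    // (Same pattern as in every diagram kernel: this diagram's |amp|^2 enters
+    // the single-diagram-enhancement numerator only when channelId selects
+    // diagram 868, while it enters the common denominator for any nonzero
+    // channelId, i.e. whenever SDE multichannel sampling is enabled.)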
J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram869( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 869 OF 1240 *** + // Wavefunction(s) for diagram number 869 + // (none) + // Amplitude(s) for diagram number 869 + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram870( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 870 OF 1240 *** + // Wavefunction(s) for diagram number 870 + // (none) + // Amplitude(s) for diagram number 870 + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram871( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 871 OF 1240 *** + // Wavefunction(s) for diagram number 871 + // (none) + // Amplitude(s) for diagram number 871 + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
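+  //--------------------------------------------------------------------------
+
+  // NB: the "diagram_boilerplate.h" fragment included at the top of every
+  // diagramXXX kernel is not itself part of this diff. Judging only from the
+  // names it must bring into scope (channelId, amp_sv, the _fp alias passed
+  // as &_fp[0], numerators_sv and denominators_sv), a minimal sketch of its
+  // contents could look as follows - an illustrative assumption for readers
+  // of this diff, not the actual header:
+  //
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   // Uniform interface: without multichannel support all three pointers are nullptr
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  //   #endif
+  //   cxtype_sv amp_sv[1] = {};                          // buffer for one amplitude
+  //   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // scalar view used as &_fp[0]
+  //   const unsigned int channelId = ( channelIds ? channelIds[0] : 0 ); // SCALAR id in C++; per-event on GPU
+  //   // ... plus numerators_sv/denominators_sv kernel-access views of numerators/denominators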
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram872( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 872 OF 1240 ***
+    // Wavefunction(s) for diagram number 872
+    // (none)
+    // Amplitude(s) for diagram number 872
+    FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram873( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 873 OF 1240 ***
+    // Wavefunction(s) for diagram number 873
+    // (none)
+    // Amplitude(s) for diagram number 873
+    FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram874( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 874 OF 1240 ***
+    // Wavefunction(s) for diagram number 874
+    // (none)
+    // Amplitude(s) for diagram number 874
+    FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram875( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 875 OF 1240 ***
+    // Wavefunction(s) for diagram number 875
+    // (none)
+    // Amplitude(s) for diagram number 875
+    VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
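[Editor's note] The twin `if( channelId == NNN ) ... / if( channelId != 0 ) ...` lines repeated in each single-amplitude diagram implement the single-diagram-enhancement weights: |amp|² enters the numerator only for the diagram selected by channelId, and the denominator for every diagram whenever enhancement is active. A condensed restatement of the pattern (names reused from the generated code; the helper itself is illustrative, not part of the plugin):

    // Illustrative restatement of the multichannel accumulation above: cxabs2
    // computes |z|^2 of a (possibly vectorized) complex amplitude.
    template<typename FP, typename CX>
    inline void addChannelWeights( unsigned int channelId, unsigned int thisDiagram,
                                   const CX& amp, FP& numerator, FP& denominator )
    {
      if( channelId == thisDiagram ) numerator += cxabs2( amp ); // selected channel only
      if( channelId != 0 ) denominator += cxabs2( amp );         // all diagrams while SDE is on
    }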
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram876( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 876 OF 1240 ***
+    // Wavefunction(s) for diagram number 876
+    // (none)
+    // Amplitude(s) for diagram number 876
+    FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram877( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 877 OF 1240 ***
+    // Wavefunction(s) for diagram number 877
+    // (none)
+    // Amplitude(s) for diagram number 877
+    FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram878( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 878 OF 1240 ***
+    // Wavefunction(s) for diagram number 878
+    // (none)
+    // Amplitude(s) for diagram number 878
+    FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram879( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 879 OF 1240 ***
+    // Wavefunction(s) for diagram number 879
+    // (none)
+    // Amplitude(s) for diagram number 879
+    FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
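[Editor's note] Each amplitude is scattered into the color-flow array jamps with coefficients ±1 or ±i (the cxtype( 0, 1 ) factor). The accessor J_ACCESS::kernelAccessIcol is defined elsewhere; given the layout jamps[ncolor*2*nevtORneppV] declared in every kernel signature, its index arithmetic is presumably along these lines (an illustration under that assumption, not the plugin code, with nevtORneppV written as neppV for one C++ event page):

    // Hypothetical index arithmetic behind J_ACCESS::kernelAccessIcol: color
    // index outermost, then real/imaginary part, then the event-page stride.
    inline fptype& jampReal( fptype* jamps, int icol, int ieppV ) { return jamps[( 2 * icol ) * neppV + ieppV]; }
    inline fptype& jampImag( fptype* jamps, int icol, int ieppV ) { return jamps[( 2 * icol + 1 ) * neppV + ieppV]; }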
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram880( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 880 OF 1240 ***
+    // Wavefunction(s) for diagram number 880
+    // (none)
+    // Amplitude(s) for diagram number 880
+    VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram881( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 881 OF 1240 ***
+    // Wavefunction(s) for diagram number 881
+    // (none)
+    // Amplitude(s) for diagram number 881
+    FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram882( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 882 OF 1240 ***
+    // Wavefunction(s) for diagram number 882
+    // (none)
+    // Amplitude(s) for diagram number 882
+    VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
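[Editor's note] Diagram 881 above (like 871 and 888) folds three FFV1_0 amplitudes into one kernel and carries no numerator/denominator update: a diagram split over several amplitudes does not map onto a single integration channel, so it presumably takes no part in the single-diagram enhancement. The uniform signature then lets a caller drive every kernel the same way; a hypothetical driver, assuming the C++ build where the __global__ macro expands to nothing, could look like this:

    // Hypothetical driver loop over the generated kernels (C++ backend assumed;
    // the actual orchestration in the plugin is not part of this hunk).
    inline void runAllDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                const fptype** COUPs, fptype* numerators, fptype* denominators )
    {
      typedef void ( *diagram_t )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
      static const diagram_t diagrams[] = { diagram869, diagram870, diagram871 }; // ...and so on
      for( diagram_t d : diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators );
    }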
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram883( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 883 OF 1240 ***
+    // Wavefunction(s) for diagram number 883
+    // (none)
+    // Amplitude(s) for diagram number 883
+    FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram884( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 884 OF 1240 ***
+    // Wavefunction(s) for diagram number 884
+    // (none)
+    // Amplitude(s) for diagram number 884
+    FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram885( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 885 OF 1240 ***
+    // Wavefunction(s) for diagram number 885
+    // (none)
+    // Amplitude(s) for diagram number 885
+    FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram886( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 886 OF 1240 ***
+    // Wavefunction(s) for diagram number 886
+    // (none)
+    // Amplitude(s) for diagram number 886
+    FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram887( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 887 OF 1240 ***
+    // Wavefunction(s) for diagram number 887
+    // (none)
+    // Amplitude(s) for diagram number 887
+    VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram888( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 888 OF 1240 ***
+    // Wavefunction(s) for diagram number 888
+    // (none)
+    // Amplitude(s) for diagram number 888
+    FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram889( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 889 OF 1240 ***
+    // Wavefunction(s) for diagram number 889
+    // (none)
+    // Amplitude(s) for diagram number 889
+    FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
"diagram_boilerplate.h" + // *** DIAGRAM 890 OF 1240 *** + // Wavefunction(s) for diagram number 890 + // (none) + // Amplitude(s) for diagram number 890 + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram891( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 891 OF 1240 *** + // Wavefunction(s) for diagram number 891 + // (none) + // Amplitude(s) for diagram number 891 + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram892( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 892 OF 1240 *** + // Wavefunction(s) for diagram number 892 + // (none) + // Amplitude(s) for diagram number 892 + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 893 OF 1240 *** + // Wavefunction(s) for diagram number 893 + // (none) + // Amplitude(s) for diagram number 893 + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 894 OF 1240 ***
+    // Wavefunction(s) for diagram number 894
+    // (none)
+    // Amplitude(s) for diagram number 894
+    FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram895( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 895 OF 1240 ***
+    // Wavefunction(s) for diagram number 895
+    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] );
+    // Amplitude(s) for diagram number 895
+    VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram896( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 896 OF 1240 ***
+    // Wavefunction(s) for diagram number 896
+    // (none)
+    // Amplitude(s) for diagram number 896
+    VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
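[Editor's note] Unlike its neighbours, diagram 895 has a non-empty wavefunction step: VVV1P0_1 builds the internal gluon propagator from w_fp[0] and w_fp[66] and overwrites slot w_fp[65], which diagrams 896 and 897 then reuse. The two literal zeros are the mass and width of the internal particle. The call shape, paraphrased from the call site (the argument names here are descriptive, not the ALOHA originals):

    // VVV1P0_1( V2_in, V3_in, COUP, sign, M1, W1, V1_out )
    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); // gluon: M1 = 0., W1 = 0.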
"diagram_boilerplate.h" + // *** DIAGRAM 897 OF 1240 *** + // Wavefunction(s) for diagram number 897 + // (none) + // Amplitude(s) for diagram number 897 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram898( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 898 OF 1240 *** + // Wavefunction(s) for diagram number 898 + // (none) + // Amplitude(s) for diagram number 898 + VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram899( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 899 OF 1240 *** + // Wavefunction(s) for diagram number 899 + // (none) + // Amplitude(s) for diagram number 899 + VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + 
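[Editor's note] Diagrams 897 and 900 each evaluate a four-gluon vertex, which carries three independent color structures; ALOHA therefore emits three amplitude routines (VVVV1_0, VVVV3_0, VVVV4_0) over the same four wavefunctions, and each result is scattered into jamps with its own coefficient table. Schematically (a condensed paraphrase of diagram 900 above, not new plugin code):

    // Three color structures of one four-gluon vertex, one ALOHA call each,
    // each amplitude scattered into jamps with its own sign pattern; as for all
    // multi-amplitude diagrams, there is no numerator/denominator update.
    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 1
    // ... += / -= amp_sv[0] into its jamps subset ...
    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 2
    // ... second sign pattern ...
    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 3
    // ... third sign pattern ...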
J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram900( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 900 OF 1240 *** + // Wavefunction(s) for diagram number 900 + // (none) + // Amplitude(s) for diagram number 900 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram901( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 901 OF 1240 *** + // Wavefunction(s) for diagram number 901 + // (none) + // Amplitude(s) for diagram number 901 + VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + 
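The `if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );` pairs seen in these kernels implement the single-diagram-enhancement bookkeeping: the numerator keeps only |amp|^2 of the diagram matching the selected channel, while the denominator accumulates |amp|^2 over every diagram, with channelId 0 disabling the machinery. A minimal scalar sketch of that logic follows; it uses hypothetical standalone types (std::complex on single events) rather than the plugin's SIMD/GPU event-page types, so it illustrates the arithmetic only, not the real API.

    #include <complex>
    #include <vector>

    // Scalar illustration (hypothetical rewrite) of the multichannel
    // numerator/denominator accumulation performed by the generated
    // diagramXXX kernels: channelId is 1-based, 0 disables SDE.
    double singleDiagramWeight( const std::vector<std::complex<double>>& diagramAmps,
                                unsigned int channelId )
    {
      double numerator = 0., denominator = 0.;
      for( unsigned int idiag = 1; idiag <= diagramAmps.size(); ++idiag )
      {
        const double abs2 = std::norm( diagramAmps[idiag - 1] ); // |amp|^2, as cxabs2 computes
        if( channelId == idiag ) numerator += abs2;   // only the selected channel's diagram
        if( channelId != 0 ) denominator += abs2;     // every diagram with a channel
      }
      return ( channelId != 0 ? numerator / denominator : 1. ); // multichannel weight
    }

Note that diagrams contributing several amplitudes through VVVV1/VVVV3/VVVV4 splittings (such as diagram 900 above) carry no numerator/denominator block at all, so they never enter the single-diagram weight.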
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram902( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 902 OF 1240 ***
+    // Wavefunction(s) for diagram number 902
+    // (none)
+    // Amplitude(s) for diagram number 902
+    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram903( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 903 OF 1240 ***
+    // Wavefunction(s) for diagram number 903
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 903
+    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram904( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 904 OF 1240 ***
+    // Wavefunction(s) for diagram number 904
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 904
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram905( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 905 OF 1240 ***
+    // Wavefunction(s) for diagram number 905
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 905
+    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram906( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 906 OF 1240 ***
+    // Wavefunction(s) for diagram number 906
+    // (none)
+    // Amplitude(s) for diagram number 906
+    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram907( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 907 OF 1240 ***
+    // Wavefunction(s) for diagram number 907
+    // (none)
+    // Amplitude(s) for diagram number 907
+    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram908( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 908 OF 1240 ***
+    // Wavefunction(s) for diagram number 908
+    // (none)
+    // Amplitude(s) for diagram number 908
+    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram909( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 909 OF 1240 ***
+    // Wavefunction(s) for diagram number 909
+    // (none)
+    // Amplitude(s) for diagram number 909
+    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram910( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 910 OF 1240 ***
+    // Wavefunction(s) for diagram number 910
+    // (none)
+    // Amplitude(s) for diagram number 910
+    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram911( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 911 OF 1240 ***
+    // Wavefunction(s) for diagram number 911
+    // (none)
+    // Amplitude(s) for diagram number 911
+    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram912( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 912 OF 1240 ***
+    // Wavefunction(s) for diagram number 912
+    // (none)
+    // Amplitude(s) for diagram number 912
+    FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram913( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 913 OF 1240 ***
+    // Wavefunction(s) for diagram number 913
+    // (none)
+    // Amplitude(s) for diagram number 913
+    FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram914( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 914 OF 1240 ***
+    // Wavefunction(s) for diagram number 914
+    // (none)
+    // Amplitude(s) for diagram number 914
+    FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram915( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 915 OF 1240 ***
+    // Wavefunction(s) for diagram number 915
+    // (none)
+    // Amplitude(s) for diagram number 915
+    FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram916( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 916 OF 1240 ***
+    // Wavefunction(s) for diagram number 916
+    // (none)
+    // Amplitude(s) for diagram number 916
+    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram917( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 917 OF 1240 ***
+    // Wavefunction(s) for diagram number 917
+    // (none)
+    // Amplitude(s) for diagram number 917
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram918( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 918 OF 1240 ***
+    // Wavefunction(s) for diagram number 918
+    // (none)
+    // Amplitude(s) for diagram number 918
+    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
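Every kernel above repeats the same two-line comment about `diagram_boilerplate.h`: the uniform diagramXXX signature always carries channelIds, numerators and denominators, and when MGONGPU_SUPPORTS_MULTICHANNEL is not defined the boilerplate asserts that all three were passed as nullptr. The contents of that header are not shown in this diff; the following is only a hypothetical sketch of such a guard (the macro name is invented for illustration), to make the described contract concrete.

    // Hypothetical illustration only: the real "diagram_boilerplate.h" is not
    // part of this diff. This mirrors the contract stated in the repeated
    // comment: with multichannel support compiled out, the uniform interface
    // still receives the three pointers, and a sanity check requires that the
    // caller passed nullptr for all of them.
    #include <cassert>
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    #define CHECK_NO_MULTICHANNEL( channelIds, numerators, denominators ) \
      assert( ( channelIds ) == nullptr && ( numerators ) == nullptr && ( denominators ) == nullptr )
    #else
    #define CHECK_NO_MULTICHANNEL( channelIds, numerators, denominators ) ( (void)0 )
    #endif

Keeping one signature for both builds lets the per-diagram kernels be generated and called uniformly, at the cost of a few unused parameters in the non-multichannel configuration.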
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram919( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 919 OF 1240 ***
+    // Wavefunction(s) for diagram number 919
+    // (none)
+    // Amplitude(s) for diagram number 919
+    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram920( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 920 OF 1240 ***
+    // Wavefunction(s) for diagram number 920
+    // (none)
+    // Amplitude(s) for diagram number 920
+    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram921( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 921 OF 1240 ***
+    // Wavefunction(s) for diagram number 921
+    // (none)
+    // Amplitude(s) for diagram number 921
+    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram922( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 922 OF 1240 ***
+    // Wavefunction(s) for diagram number 922
+    // (none)
+    // Amplitude(s) for diagram number 922
+    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram923( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 923 OF 1240 ***
+    // Wavefunction(s) for diagram number 923
+    // (none)
+    // Amplitude(s) for diagram number 923
+    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram924( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 924 OF 1240 ***
+    // Wavefunction(s) for diagram number 924
+    // (none)
+    // Amplitude(s) for diagram number 924
+    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram925( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 925 OF 1240 ***
+    // Wavefunction(s) for diagram number 925
+    // (none)
+    // Amplitude(s) for diagram number 925
+    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram926( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 926 OF 1240 ***
+    // Wavefunction(s) for diagram number 926
+    // (none)
+    // Amplitude(s) for diagram number 926
+    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram927( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 927 OF 1240 ***
+    // Wavefunction(s) for diagram number 927
+    // (none)
+    // Amplitude(s) for diagram number 927
+    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram928( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 928 OF 1240 *** + // Wavefunction(s) for diagram number 928 + // (none) + // Amplitude(s) for diagram number 928 + FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram929( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 929 OF 1240 *** + // Wavefunction(s) for diagram number 929 + // (none) + // Amplitude(s) for diagram number 929 + FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram930( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 930 OF 1240 *** + // Wavefunction(s) for diagram number 930 + // (none) + // Amplitude(s) for diagram number 930 + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram931( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 931 OF 1240 *** + // Wavefunction(s) for diagram number 931 + // (none) + // Amplitude(s) for diagram number 931 + VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram932( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 932 OF 1240 *** + // Wavefunction(s) for diagram number 932 + // (none) + // Amplitude(s) for diagram number 932 + FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram933( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 933 OF 1240 *** + // Wavefunction(s) for diagram number 933 + // (none) + // Amplitude(s) for diagram number 933 + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram934( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 934 OF 1240 *** + // Wavefunction(s) for diagram number 934 + // (none) + // Amplitude(s) for diagram number 934 + FFV1_0( 
w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram935( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 935 OF 1240 *** + // Wavefunction(s) for diagram number 935 + // (none) + // Amplitude(s) for diagram number 935 + FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram936( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 936 OF 1240 *** + // Wavefunction(s) for diagram number 936 + // (none) + // Amplitude(s) for diagram number 936 + VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 
1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram937( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 937 OF 1240 *** + // Wavefunction(s) for diagram number 937 + // (none) + // Amplitude(s) for diagram number 937 + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram938( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 938 OF 1240 *** + // Wavefunction(s) for diagram number 938 + // (none) + // Amplitude(s) for diagram number 938 + VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram939( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 939 OF 1240 *** + // Wavefunction(s) for diagram number 939 + // (none) + // Amplitude(s) for diagram number 939 + FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] 
); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram940( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 940 OF 1240 *** + // Wavefunction(s) for diagram number 940 + // (none) + // Amplitude(s) for diagram number 940 + FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram941( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 941 OF 1240 *** + // Wavefunction(s) for diagram number 941 + // (none) + // Amplitude(s) for diagram number 941 + FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) 
-= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram942( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 942 OF 1240 *** + // Wavefunction(s) for diagram number 942 + // (none) + // Amplitude(s) for diagram number 942 + FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram943( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 943 OF 1240 *** + // Wavefunction(s) for diagram number 943 + // (none) + // Amplitude(s) for diagram number 943 + VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram944( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 944 OF 1240 *** + // Wavefunction(s) for diagram number 944 + // (none) + // Amplitude(s) for diagram number 944 + FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram945( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 945 OF 1240 *** + // Wavefunction(s) for diagram number 945 + // (none) + // Amplitude(s) for diagram number 945 + FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 946 OF 1240 *** + // Wavefunction(s) for diagram number 946 + // (none) + // Amplitude(s) for diagram number 946 + FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram947( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 947 OF 1240 *** + // Wavefunction(s) for diagram number 947 + // (none) + // Amplitude(s) for diagram number 947 + FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram948( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 948 OF 1240 *** + // Wavefunction(s) for diagram number 948 + // (none) + // Amplitude(s) for diagram number 948 + FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram949( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 949 OF 1240 *** + // Wavefunction(s) for diagram number 949 + // (none) + // Amplitude(s) for diagram number 949 + FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram950( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 950 OF 1240 *** + // 
Wavefunction(s) for diagram number 950 + // (none) + // Amplitude(s) for diagram number 950 + FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram951( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 951 OF 1240 *** + // Wavefunction(s) for diagram number 951 + VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 951 + VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram952( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 952 OF 1240 *** + // Wavefunction(s) for diagram number 952 + // (none) + // Amplitude(s) for diagram number 952 + VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram953( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 953 OF 1240 *** + // Wavefunction(s) for diagram number 953 + // (none) + // Amplitude(s) for diagram number 953 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram954( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 954 OF 1240 *** + // Wavefunction(s) for diagram number 954 + // (none) + // Amplitude(s) for diagram number 954 + VVV1_0( 
w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram955( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 955 OF 1240 *** + // Wavefunction(s) for diagram number 955 + // (none) + // Amplitude(s) for diagram number 955 + VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
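The multichannel bookkeeping repeated in every diagramXXX kernel above follows one rule: cxabs2( amp_sv[0] ) is added to the event's denominator for every diagram whenever a channel is selected ( channelId != 0 ), and to the numerator only when the diagram number matches the selected channel, so that numerator/denominator is the single-diagram-enhancement (SDE) weight. A minimal standalone sketch of just that rule follows; the std::complex stand-in for cxtype and the addDiagram helper are illustrative assumptions, not code from this patch.

// Illustrative sketch only (not part of this patch): the SDE numerator and
// denominator accumulation performed under MGONGPU_SUPPORTS_MULTICHANNEL.
#include <complex>
#include <cstdio>
using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
// |amp|^2, the quantity accumulated into the numerators/denominators
inline double cxabs2( const cxtype& amp ) { return std::norm( amp ); }
// One diagram's contribution (hypothetical helper name): the numerator grows
// only when this diagram is the selected channel, the denominator grows for
// every diagram, and channelId == 0 disables the SDE weighting altogether.
void addDiagram( unsigned int channelId, unsigned int thisDiagram, const cxtype& amp, double& num, double& den )
{
  if( channelId == thisDiagram ) num += cxabs2( amp );
  if( channelId != 0 ) den += cxabs2( amp );
}
int main()
{
  double num = 0., den = 0.;
  const unsigned int channelId = 954; // pretend SDE selected channel 954
  addDiagram( channelId, 954, cxtype( 1., 2. ), num, den ); // the selected diagram
  addDiagram( channelId, 955, cxtype( 0., 1. ), num, den ); // any other diagram
  std::printf( "SDE weight = %g / %g = %g\n", num, den, num / den ); // 5 / 6
  return 0;
}

Keeping the denominator update unconditional on the diagram identity (gated only on channelId != 0) is what makes the weight a ratio of one diagram's |amp|^2 to the sum over all diagrams.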
__global__ void + diagram956( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 956 OF 1240 *** + // Wavefunction(s) for diagram number 956 + // (none) + // Amplitude(s) for diagram number 956 + VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram957( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 957 OF 1240 *** + // Wavefunction(s) for diagram number 957 + // (none) + // Amplitude(s) for diagram number 957 + VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram958( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 958 OF 1240 *** + // Wavefunction(s) for diagram number 958 + // (none) + // Amplitude(s) for diagram number 958 + VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram959( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 959 OF 1240 *** + // Wavefunction(s) for diagram number 959 + VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 959 + VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram960( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 960 OF 1240 *** + // Wavefunction(s) for diagram number 960 + VVVV1P0_1( 
w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] ); + // Amplitude(s) for diagram number 960 + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram961( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const 
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 961 OF 1240 *** + // Wavefunction(s) for diagram number 961 + // (none) + // Amplitude(s) for diagram number 961 + VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram962( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 962 OF 1240 *** + // Wavefunction(s) for diagram number 962 + // (none) + // Amplitude(s) for diagram number 962 + VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram963( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 963 OF 1240 *** + // Wavefunction(s) for diagram number 963 + // (none) + // Amplitude(s) for diagram number 963 + VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram964( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 964 OF 1240 *** + // Wavefunction(s) for diagram number 964 + // (none) + // Amplitude(s) for diagram number 964 + VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram965( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 965 OF 1240 *** + // Wavefunction(s) for diagram number 965 + // (none) + // Amplitude(s) for diagram number 965 + VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram966( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 966 OF 1240 *** + // Wavefunction(s) for diagram 
number 966 + // (none) + // Amplitude(s) for diagram number 966 + VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram967( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 967 OF 1240 *** + // Wavefunction(s) for diagram number 967 + // (none) + // Amplitude(s) for diagram number 967 + VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram968( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 968 OF 1240 *** + // Wavefunction(s) for diagram number 968 + // (none) + // Amplitude(s) for diagram number 968 + FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram969( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 969 OF 1240 *** + // Wavefunction(s) for diagram number 969 + // (none) + // Amplitude(s) for diagram number 969 + FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram970( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 970 OF 1240 *** + // Wavefunction(s) for diagram number 970 + // (none) + // Amplitude(s) for diagram number 970 + FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram971( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 971 OF 1240 *** + // Wavefunction(s) for diagram number 971 + // (none) + // Amplitude(s) for diagram number 971 + FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram972( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 972 OF 1240 *** + // Wavefunction(s) for diagram number 972 + // (none) + // Amplitude(s) for diagram number 972 + VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram973( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 973 OF 1240 *** + // Wavefunction(s) for diagram number 973 + // (none) + // Amplitude(s) for diagram number 973 + FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram974( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 974 OF 1240 *** + // Wavefunction(s) for diagram number 974 + // (none) + // Amplitude(s) for diagram number 974 + FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram975( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 975 OF 1240 
*** + // Wavefunction(s) for diagram number 975 + // (none) + // Amplitude(s) for diagram number 975 + FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram976( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 976 OF 1240 *** + // Wavefunction(s) for diagram number 976 + // (none) + // Amplitude(s) for diagram number 976 + FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram977( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 977 OF 1240 *** + // Wavefunction(s) for diagram number 977 + // (none) + // Amplitude(s) for diagram number 977 + VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + 
J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram978( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 978 OF 1240 *** + // Wavefunction(s) for diagram number 978 + // (none) + // Amplitude(s) for diagram number 978 + FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram979( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 979 OF 1240 *** + // Wavefunction(s) for diagram number 979 + // (none) + // Amplitude(s) for diagram number 979 + FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram980( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 980 OF 1240 *** + // Wavefunction(s) for diagram number 980 + // (none) + // Amplitude(s) for diagram number 980 + FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram981( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 981 OF 1240 *** + // Wavefunction(s) for diagram number 981 + // (none) + // Amplitude(s) for diagram number 981 + FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram982( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 982 OF 1240 *** + // Wavefunction(s) for diagram number 982 + // (none) + // Amplitude(s) for diagram number 982 + VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram983( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 983 OF 1240 *** + // Wavefunction(s) for diagram number 983 + // (none) + // Amplitude(s) for diagram number 983 + FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram984( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 984 OF 1240 *** + // Wavefunction(s) for diagram number 984 + // (none) + // Amplitude(s) for diagram number 984 + FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram985( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // 
output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 985 OF 1240 *** + // Wavefunction(s) for diagram number 985 + // (none) + // Amplitude(s) for diagram number 985 + FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram986( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 986 OF 1240 *** + // Wavefunction(s) for diagram number 986 + // (none) + // Amplitude(s) for diagram number 986 + FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram987( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 987 OF 1240 *** + // Wavefunction(s) for diagram number 987 + // (none) + // Amplitude(s) for diagram number 987 + VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram988( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 988 OF 1240 *** + // Wavefunction(s) for diagram number 988 + // (none) + // Amplitude(s) for diagram number 988 + FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram989( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + 
const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 989 OF 1240 *** + // Wavefunction(s) for diagram number 989 + // (none) + // Amplitude(s) for diagram number 989 + FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram990( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 990 OF 1240 *** + // Wavefunction(s) for diagram number 990 + // (none) + // Amplitude(s) for diagram number 990 + FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram991( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 991 OF 1240 *** + // Wavefunction(s) for diagram number 991 + // (none) + // Amplitude(s) for diagram number 991 + FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram992( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 992 OF 1240 *** + // Wavefunction(s) for diagram number 992 + // (none) + // Amplitude(s) for diagram number 992 + VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram993( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 993 OF 1240 *** + // Wavefunction(s) for diagram number 993 + // (none) + // Amplitude(s) for diagram number 993 + FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram994( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code 
asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 994 OF 1240 *** + // Wavefunction(s) for diagram number 994 + // (none) + // Amplitude(s) for diagram number 994 + VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram995( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 995 OF 1240 *** + // Wavefunction(s) for diagram number 995 + // (none) + // Amplitude(s) for diagram number 995 + FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram996( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for 
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 996 OF 1240 *** + // Wavefunction(s) for diagram number 996 + // (none) + // Amplitude(s) for diagram number 996 + FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 997 OF 1240 *** + // Wavefunction(s) for diagram number 997 + // (none) + // Amplitude(s) for diagram number 997 + FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram998( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 998 OF 
1240 *** + // Wavefunction(s) for diagram number 998 + // (none) + // Amplitude(s) for diagram number 998 + FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram999( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 999 OF 1240 *** + // Wavefunction(s) for diagram number 999 + // (none) + // Amplitude(s) for diagram number 999 + VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1000( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr 
as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1000 OF 1240 *** + // Wavefunction(s) for diagram number 1000 + // (none) + // Amplitude(s) for diagram number 1000 + FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1001( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1001 OF 1240 *** + // Wavefunction(s) for diagram number 1001 + // (none) + // Amplitude(s) for diagram number 1001 + FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv 
+= cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1002( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1002 OF 1240 *** + // Wavefunction(s) for diagram number 1002 + // (none) + // Amplitude(s) for diagram number 1002 + FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1003( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used 
also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1003 OF 1240 *** + // Wavefunction(s) for diagram number 1003 + // (none) + // Amplitude(s) for diagram number 1003 + FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1004( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1004 OF 1240 *** + // Wavefunction(s) for diagram number 1004 + // (none) + // Amplitude(s) for diagram number 1004 + FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1005( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable 
SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1005 OF 1240 *** + // Wavefunction(s) for diagram number 1005 + // (none) + // Amplitude(s) for diagram number 1005 + FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1006( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1006 OF 1240 *** + // Wavefunction(s) for diagram number 1006 + // (none) + // Amplitude(s) for diagram number 1006 + FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1007( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1007 OF 1240 *** + // Wavefunction(s) for diagram number 1007 + // (none) + // Amplitude(s) for diagram number 1007 + VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1008( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1008 OF 1240 *** + // Wavefunction(s) for diagram number 1008 + // (none) + // Amplitude(s) for diagram number 1008 + VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1009( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1009 OF 1240 *** + // Wavefunction(s) for diagram number 1009 + // (none) + // Amplitude(s) for diagram number 1009 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1010( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1010 OF 1240 *** + // Wavefunction(s) for diagram number 1010 + // (none) + // Amplitude(s) for diagram number 1010 + VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram1011( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1011 OF 1240 *** + // Wavefunction(s) for diagram number 1011 + // (none) + // Amplitude(s) for diagram number 1011 + VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1012( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1012 OF 1240 *** + // Wavefunction(s) for diagram number 1012 + // (none) + // Amplitude(s) for diagram 
number 1012 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1013( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, 
// input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1013 OF 1240 *** + // Wavefunction(s) for diagram number 1013 + // (none) + // Amplitude(s) for diagram number 1013 + VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1014( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1014 OF 1240 *** + // Wavefunction(s) for diagram number 1014 + // (none) + // Amplitude(s) for diagram number 1014 + VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1015( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1015 OF 1240 *** + // Wavefunction(s) for diagram number 1015 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); + // Amplitude(s) for diagram number 1015 + VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1016( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1016 OF 1240 *** + // Wavefunction(s) for diagram number 1016 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1016 + VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1017( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1017 OF 1240 *** + // Wavefunction(s) for diagram number 1017 + // (none) + // Amplitude(s) for diagram number 1017 + VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1018( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1018 OF 1240 *** + // Wavefunction(s) for diagram number 1018 + // (none) + // Amplitude(s) for diagram number 1018 + VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1019( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1019 OF 1240 *** + // Wavefunction(s) for diagram number 1019 + // (none) + // Amplitude(s) for diagram number 1019 + VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1020 OF 1240 *** + // Wavefunction(s) for diagram number 1020 + // (none) + // Amplitude(s) for diagram number 1020 + VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; 
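[Note on the uniform kernel interface: every diagramXXXX kernel shares one signature, so the driver can launch them uniformly whether or not multichannel is compiled in: wavefunctions and jamps, the channelIds array (GPU) or scalar channelId (C++), per-event dependent couplings (GPU) or a COUPs pointer array for one event page (C++), plus the multichannel numerators and denominators. The shared preamble lives in diagram_boilerplate.h, which is not shown in this diff; per the comment repeated in every kernel, when MGONGPU_SUPPORTS_MULTICHANNEL is not defined it must assert that the three multichannel pointers are nullptr. A guessed sketch of just that check, mirroring the kernel parameter names:]

  // Hypothetical reconstruction of one job of diagram_boilerplate.h,
  // based only on the recurring comment (assert from <cassert>):
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // SDE channels unavailable in this build
    assert( numerators == nullptr );   // multichannel numerators unused
    assert( denominators == nullptr ); // multichannel denominators unused
  #endif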
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1021( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1021 OF 1240 *** + // Wavefunction(s) for diagram number 1021 + // (none) + // Amplitude(s) for diagram number 1021 + VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + 
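[Note on the multichannel blocks in kernels like diagram1019 and diagram1020 above: the two guarded lines implement single-diagram-enhancement (SDE) weights. Only the diagram whose number matches the event's channelId feeds the numerator, while every diagram feeds the denominator unless SDE is disabled by channelId == 0; the per-event multichannel weight is then the ratio of the two accumulators. Condensed from the generated code, with thisDiagram standing for the literal diagram number:]

  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == thisDiagram ) numerators_sv += cxabs2( amp_sv[0] ); // selected channel only
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );         // all SDE-enabled diagrams
  #endif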
J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1022( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1022 OF 1240 *** + // Wavefunction(s) for diagram number 1022 + // (none) + // Amplitude(s) for diagram number 1022 + VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + 
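[Note on the four-gluon-vertex diagrams (1009, 1012, 1021, 1024 above): unlike the VVV1_0 diagrams, a single Feynman diagram here carries three color structures, so VVVV1_0, VVVV3_0 and VVVV4_0 are each evaluated on the same four wavefunctions and scattered into jamps with their own sign patterns, and no multichannel block is generated for them. The skeleton, with the jamp updates elided and the amplitude output pointer written as &amp_fp[0] (assumed buffer name):]

  // One 4-gluon diagram = three color structures on the same legs:
  VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... scatter +/- amp_sv[0] into the jamps of the first color structure ...
  VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... second sign pattern ...
  VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... third sign pattern ...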
J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1023( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1023 OF 1240 *** + // Wavefunction(s) for diagram number 1023 + // (none) + // Amplitude(s) for diagram number 1023 + VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1024( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as 
a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1024 OF 1240 *** + // Wavefunction(s) for diagram number 1024 + // (none) + // Amplitude(s) for diagram number 1024 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1025( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1025 OF 1240 *** + // Wavefunction(s) for diagram number 1025 + // (none) + // Amplitude(s) for diagram number 1025 + VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1026( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1026 OF 1240 *** + // Wavefunction(s) for diagram number 1026 + // (none) + // Amplitude(s) for diagram number 1026 + VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1027( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1027 OF 1240 *** + // Wavefunction(s) for diagram number 1027 + // (none) + // Amplitude(s) for diagram number 1027 + VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; 
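[Note on internal wavefunctions: where a diagram needs fresh off-shell lines, they are computed at the top of its kernel before the amplitude calls, again one per color structure of the four-gluon vertex. As diagram 1028 just below shows, VVVV1P0_1, VVVV3P0_1 and VVVV4P0_1 each contract w_fp[0], w_fp[1] and w_fp[27] into a distinct off-shell gluon (the two trailing 0. arguments are the propagator mass and width), which subsequent VVV1_0 calls then close against the remaining legs:]

  // Restated from diagram1028 below: three internal off-shell gluons,
  // one per VVVV color structure (massless, zero-width propagator):
  VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] );
  VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] );
  VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] );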
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1028( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1028 OF 1240 *** + // Wavefunction(s) for diagram number 1028 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1028 + VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1029( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1029 OF 1240 *** + // Wavefunction(s) for diagram number 1029 + // (none) + // Amplitude(s) for diagram number 1029 + VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1030( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that 
all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1030 OF 1240 *** + // Wavefunction(s) for diagram number 1030 + // (none) + // Amplitude(s) for diagram number 1030 + VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1031( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1031 OF 1240 *** + // Wavefunction(s) for diagram number 1031 + // (none) + // Amplitude(s) for diagram number 1031 + VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1032( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1032 OF 1240 *** + // Wavefunction(s) for diagram number 1032 + // (none) + // Amplitude(s) for diagram number 1032 + VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps,
5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1033( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1033 OF 1240 *** + // Wavefunction(s) for diagram number 1033 + // (none) + // Amplitude(s) for diagram number 1033 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1034( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1034 OF 1240 *** + // Wavefunction(s) for diagram number 1034 + // (none) + // Amplitude(s) for diagram number 1034 + VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; 
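+ // (the MGONGPU_SUPPORTS_MULTICHANNEL block above implements single-diagram enhancement: |amp|^2 of this diagram feeds the SDE numerator only when channelId selects diagram 1034, and the denominator whenever channelId is non-zero)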
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1035( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1035 OF 1240 *** + // Wavefunction(s) for diagram number 1035 + // (none) + // Amplitude(s) for diagram number 1035 + VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1036( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1036 OF 1240 *** + // Wavefunction(s) for 
diagram number 1036 + // (none) + // Amplitude(s) for diagram number 1036 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1037( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1037 OF 1240 *** + // Wavefunction(s) for diagram number 1037 + // (none) + // Amplitude(s) for diagram number 1037 + VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1038 OF 1240 *** + // Wavefunction(s) for diagram number 1038 + // (none) + // Amplitude(s) for diagram number 1038 + VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + 
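// (note that channelId is the 1-based diagram number, cf. '1 to #diagrams, 0 to disable SDE' in the signature comment, so channelId == 1038 selects this diagram) +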
J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1039( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1039 OF 1240 *** + // Wavefunction(s) for diagram number 1039 + // (none) + // Amplitude(s) for diagram number 1039 + VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1040( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1040 OF 1240 *** + // Wavefunction(s) for diagram number 1040 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 1040 + VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + 
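// (same VVV contraction against the second quartic-vertex colour structure, the VVVV3P0_1 output w_fp[42]) +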
VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1041( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1041 OF 1240 *** + // Wavefunction(s) for diagram number 1041 + // (none) + // Amplitude(s) for diagram number 1041 + VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1042( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 1042 OF 1240 *** + // Wavefunction(s) for diagram number 1042 + // (none) + // Amplitude(s) for diagram number 1042 + VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1043( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1043 OF 1240 ***
+    // Wavefunction(s) for diagram number 1043
+    // (none)
+    // Amplitude(s) for diagram number 1043
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
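Every diagramNNNN kernel in this hunk shares the uniform signature above; only the HELAS calls and jamp updates differ. The channelId, numerators_sv, denominators_sv and amp_sv names used in the bodies are set up by the included diagram_boilerplate.h, which is not part of this hunk. As orientation only, a minimal sketch of what such a header could contain, assuming the conventions stated in the parameter comments (every name not appearing in the hunk is hypothetical):

    // Hypothetical sketch only - not the diagram_boilerplate.h shipped in this PR
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // Derive a scalar channelId (SCALAR channelId[0] on C++, per-event values on GPU)
      const unsigned int channelId = channelIds[0];
      // Map the raw fptype* buffers onto the vector views used by the kernel body
      fptype_sv& numerators_sv = *reinterpret_cast<fptype_sv*>( numerators );
      fptype_sv& denominators_sv = *reinterpret_cast<fptype_sv*>( denominators );
    #else
      // Uniform interface: the three multichannel pointers must be nullptr here
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif
    // Local amplitude buffer: HELAS calls write into amp_fp, the body reads amp_sv
    // (the w_fp[] views onto wfs would be derived here as well)
    fptype amp_fp[2 * neppV];
    cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp );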
+  __global__ void
+  diagram1044( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1044 OF 1240 ***
+    // Wavefunction(s) for diagram number 1044
+    // (none)
+    // Amplitude(s) for diagram number 1044
+    VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1045( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1045 OF 1240 ***
+    // Wavefunction(s) for diagram number 1045
+    // (none)
+    // Amplitude(s) for diagram number 1045
+    VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1046( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1046 OF 1240 ***
+    // Wavefunction(s) for diagram number 1046
+    // (none)
+    // Amplitude(s) for diagram number 1046
+    FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1047( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1047 OF 1240 ***
+    // Wavefunction(s) for diagram number 1047
+    // (none)
+    // Amplitude(s) for diagram number 1047
+    FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1048( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1048 OF 1240 ***
+    // Wavefunction(s) for diagram number 1048
+    // (none)
+    // Amplitude(s) for diagram number 1048
+    FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1049( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1049 OF 1240 ***
+    // Wavefunction(s) for diagram number 1049
+    // (none)
+    // Amplitude(s) for diagram number 1049
+    FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
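For single-amplitude kernels such as diagrams 1046-1049 above, the multichannel bookkeeping reduces to two accumulations: |amp|^2 is added to the numerator only when this diagram is the selected channel, and to the denominator whenever single-diagram enhancement (SDE) is enabled (channelId != 0); the SDE weight applied downstream is their ratio. A self-contained toy illustration of the same accumulation pattern in plain C++ (std::complex stands in for cxtype; all values are made up):

    #include <cassert>
    #include <complex>
    #include <iostream>
    #include <vector>

    // Toy model of the SDE accumulation seen in diagram1046-1049:
    // numerators pick up |amp|^2 only for the selected channel,
    // denominators pick it up for every channel when SDE is enabled.
    int main()
    {
      const unsigned int channelId = 1047; // selected channel (0 would disable SDE)
      const std::vector<unsigned int> diagrams = { 1046, 1047, 1048, 1049 };
      const std::vector<std::complex<double>> amps = { { 0.3, 0.1 }, { 1.2, -0.4 }, { 0.2, 0.0 }, { -0.5, 0.5 } };
      double numerator = 0, denominator = 0;
      for( size_t i = 0; i < diagrams.size(); ++i )
      {
        const double abs2 = std::norm( amps[i] ); // cxabs2 equivalent: |amp|^2
        if( channelId == diagrams[i] ) numerator += abs2;
        if( channelId != 0 ) denominator += abs2;
      }
      assert( denominator > 0 );
      std::cout << "SDE weight for channel " << channelId << ": " << numerator / denominator << std::endl;
      return 0;
    }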
+  __global__ void
+  diagram1050( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1050 OF 1240 ***
+    // Wavefunction(s) for diagram number 1050
+    // (none)
+    // Amplitude(s) for diagram number 1050
+    FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1051( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1051 OF 1240 ***
+    // Wavefunction(s) for diagram number 1051
+    // (none)
+    // Amplitude(s) for diagram number 1051
+    FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1052( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1052 OF 1240 ***
+    // Wavefunction(s) for diagram number 1052
+    // (none)
+    // Amplitude(s) for diagram number 1052
+    FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1053( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1053 OF 1240 ***
+    // Wavefunction(s) for diagram number 1053
+    // (none)
+    // Amplitude(s) for diagram number 1053
+    FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1054( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1054 OF 1240 ***
+    // Wavefunction(s) for diagram number 1054
+    // (none)
+    // Amplitude(s) for diagram number 1054
+    FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1055( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1055 OF 1240 ***
+    // Wavefunction(s) for diagram number 1055
+    // (none)
+    // Amplitude(s) for diagram number 1055
+    FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1056( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1056 OF 1240 ***
+    // Wavefunction(s) for diagram number 1056
+    // (none)
+    // Amplitude(s) for diagram number 1056
+    FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1057( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1057 OF 1240 ***
+    // Wavefunction(s) for diagram number 1057
+    // (none)
+    // Amplitude(s) for diagram number 1057
+    FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1058( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1058 OF 1240 ***
+    // Wavefunction(s) for diagram number 1058
+    // (none)
+    // Amplitude(s) for diagram number 1058
+    FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1059( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1059 OF 1240 ***
+    // Wavefunction(s) for diagram number 1059
+    // (none)
+    // Amplitude(s) for diagram number 1059
+    FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1060( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1060 OF 1240 ***
+    // Wavefunction(s) for diagram number 1060
+    // (none)
+    // Amplitude(s) for diagram number 1060
+    FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1061( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1061 OF 1240 ***
+    // Wavefunction(s) for diagram number 1061
+    // (none)
+    // Amplitude(s) for diagram number 1061
+    VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1062( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1062 OF 1240 ***
+    // Wavefunction(s) for diagram number 1062
+    // (none)
+    // Amplitude(s) for diagram number 1062
+    FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1063( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1063 OF 1240 ***
+    // Wavefunction(s) for diagram number 1063
+    // (none)
+    // Amplitude(s) for diagram number 1063
+    VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1064( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1064 OF 1240 ***
+    // Wavefunction(s) for diagram number 1064
+    // (none)
+    // Amplitude(s) for diagram number 1064
+    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
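Diagrams 1058-1064 feed one amplitude into several color flows, some with a relative factor cxtype( 0, 1 ), i.e. the imaginary unit coming from the antisymmetric color structure of the triple-gluon vertex. The jamps accumulated here are not physical by themselves: elsewhere in the generated code (outside this hunk) they are contracted with the constant color matrix to give the squared matrix element, schematically |M|^2 = sum_ij conj(jamp_i) cf_ij jamp_j. A toy, self-contained version of that contraction with two flows (real processes of this size carry 120 flows, matching the jamp indices above; the cf entries below are made up):

    #include <array>
    #include <complex>
    #include <iostream>

    // Toy color sum: |M|^2 = sum_ij conj(jamp[i]) * cf[i][j] * jamp[j] (real part)
    int main()
    {
      using cxtype = std::complex<double>;
      const cxtype I( 0, 1 );                      // the cxtype( 0, 1 ) factor seen above
      std::array<cxtype, 2> jamp{};                // two color flows, zero-initialized
      const cxtype amp( 0.8, -0.3 );               // one amplitude feeding both flows
      jamp[0] += amp;                              // cf. "jamps, 48 ) += amp_sv[0]"
      jamp[1] -= I * amp;                          // cf. "jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]"
      const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // illustrative color-matrix entries
      double me2 = 0;
      for( int i = 0; i < 2; ++i )
        for( int j = 0; j < 2; ++j )
          me2 += std::real( std::conj( jamp[i] ) * cf[i][j] * jamp[j] );
      std::cout << "|M|^2 (toy) = " << me2 << std::endl;
      return 0;
    }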
+  __global__ void
+  diagram1065( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1065 OF 1240 ***
+    // Wavefunction(s) for diagram number 1065
+    // (none)
+    // Amplitude(s) for diagram number 1065
+    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1066( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1066 OF 1240 ***
+    // Wavefunction(s) for diagram number 1066
+    // (none)
+    // Amplitude(s) for diagram number 1066
+    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1067( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1067 OF 1240 ***
+    // Wavefunction(s) for diagram number 1067
+    // (none)
+    // Amplitude(s) for diagram number 1067
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1068( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1068 OF 1240 ***
+    // Wavefunction(s) for diagram number 1068
+    // (none)
+    // Amplitude(s) for diagram number 1068
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1069( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1069 OF 1240 ***
+    // Wavefunction(s) for diagram number 1069
+    // (none)
+    // Amplitude(s) for diagram number 1069
+    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1070( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1070 OF 1240 ***
+    // Wavefunction(s) for diagram number 1070
+    // (none)
+    // Amplitude(s) for diagram number 1070
+    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1071( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1071 OF 1240 ***
+    // Wavefunction(s) for diagram number 1071
+    // (none)
+    // Amplitude(s) for diagram number 1071
+    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1072 OF 1240 ***
+    // Wavefunction(s) for diagram number 1072
+    // (none)
+    // Amplitude(s) for diagram number 1072
+    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1073( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1073 OF 1240 ***
+    // Wavefunction(s) for diagram number 1073
+    // (none)
+    // Amplitude(s) for diagram number 1073
+    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1074( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1074 OF 1240 ***
+    // Wavefunction(s) for diagram number 1074
+    // (none)
+    // Amplitude(s) for diagram number 1074
+    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1075( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1075 OF 1240 ***
+    // Wavefunction(s) for diagram number 1075
+    // (none)
+    // Amplitude(s) for diagram number 1075
+    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1076( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1076 OF 1240 ***
+    // Wavefunction(s) for diagram number 1076
+    // (none)
+    // Amplitude(s) for diagram number 1076
+    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1077( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1077 OF 1240 ***
+    // Wavefunction(s) for diagram number 1077
+    // (none)
+    // Amplitude(s) for diagram number 1077
+    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1078( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1078 OF 1240 ***
+    // Wavefunction(s) for diagram number 1078
+    // (none)
+    // Amplitude(s) for diagram number 1078
+    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
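Every diagramXXX kernel in this hunk repeats the same multichannel bookkeeping: the squared amplitude of the diagram matching the selected channel is added to the numerator, and every diagram's squared amplitude to the denominator (unless channelId is 0, which disables single-diagram enhancement). A minimal standalone C++ sketch of that accumulation follows; cxabs2 is reimplemented here and all input values are invented for illustration, so this is not the plugin's actual code.

  #include <complex>
  #include <cstdio>
  #include <vector>

  using cxtype = std::complex<double>;

  // Same role as cxabs2 in the kernels: |c|^2 without the sqrt of std::abs.
  inline double cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

  int main()
  {
    // Hypothetical per-diagram amplitudes for one event and one helicity.
    const std::vector<cxtype> amps = { { 0.3, -0.1 }, { 0.05, 0.2 }, { -0.4, 0.0 } };
    const unsigned int channelId = 2; // 1-based diagram choice; 0 would disable SDE
    double numerator = 0., denominator = 0.;
    for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
    {
      const double a2 = cxabs2( amps[idiag - 1] );
      if( channelId == idiag ) numerator += a2; // only the selected channel
      if( channelId != 0 ) denominator += a2;   // every diagram (SDE enabled)
    }
    printf( "channel %u share: %f\n", channelId, numerator / denominator );
    return 0;
  }

The numerator/denominator ratio is what downstream code can use to single out one diagram's share of the full |M|^2 for this event.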
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1079( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1079 OF 1240 ***
+    // Wavefunction(s) for diagram number 1079
+    // (none)
+    // Amplitude(s) for diagram number 1079
+    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1080( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1080 OF 1240 ***
+    // Wavefunction(s) for diagram number 1080
+    // (none)
+    // Amplitude(s) for diagram number 1080
+    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1081( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1081 OF 1240 ***
+    // Wavefunction(s) for diagram number 1081
+    // (none)
+    // Amplitude(s) for diagram number 1081
+    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1082( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1082 OF 1240 ***
+    // Wavefunction(s) for diagram number 1082
+    // (none)
+    // Amplitude(s) for diagram number 1082
+    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1083( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1083 OF 1240 ***
+    // Wavefunction(s) for diagram number 1083
+    // (none)
+    // Amplitude(s) for diagram number 1083
+    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1084( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1084 OF 1240 ***
+    // Wavefunction(s) for diagram number 1084
+    // (none)
+    // Amplitude(s) for diagram number 1084
+    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1085( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1085 OF 1240 ***
+    // Wavefunction(s) for diagram number 1085
+    // (none)
+    // Amplitude(s) for diagram number 1085
+    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1086( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1086 OF 1240 ***
+    // Wavefunction(s) for diagram number 1086
+    // (none)
+    // Amplitude(s) for diagram number 1086
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1087( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1087 OF 1240 ***
+    // Wavefunction(s) for diagram number 1087
+    // (none)
+    // Amplitude(s) for diagram number 1087
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1088( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1088 OF 1240 ***
+    // Wavefunction(s) for diagram number 1088
+    // (none)
+    // Amplitude(s) for diagram number 1088
+    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1089( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1089 OF 1240 ***
+    // Wavefunction(s) for diagram number 1089
+    // (none)
+    // Amplitude(s) for diagram number 1089
+    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1090( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1090 OF 1240 ***
+    // Wavefunction(s) for diagram number 1090
+    // (none)
+    // Amplitude(s) for diagram number 1090
+    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1091( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1091 OF 1240 ***
+    // Wavefunction(s) for diagram number 1091
+    // (none)
+    // Amplitude(s) for diagram number 1091
+    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1092( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1092 OF 1240 ***
+    // Wavefunction(s) for diagram number 1092
+    // (none)
+    // Amplitude(s) for diagram number 1092
+    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1093( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1093 OF 1240 ***
+    // Wavefunction(s) for diagram number 1093
+    // (none)
+    // Amplitude(s) for diagram number 1093
+    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1094( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1094 OF 1240 ***
+    // Wavefunction(s) for diagram number 1094
+    // (none)
+    // Amplitude(s) for diagram number 1094
+    FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1095( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1095 OF 1240 ***
+    // Wavefunction(s) for diagram number 1095
+    // (none)
+    // Amplitude(s) for diagram number 1095
+    FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1096( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1096 OF 1240 ***
+    // Wavefunction(s) for diagram number 1096
+    // (none)
+    // Amplitude(s) for diagram number 1096
+    FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1097( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1097 OF 1240 ***
+    // Wavefunction(s) for diagram number 1097
+    // (none)
+    // Amplitude(s) for diagram number 1097
+    FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
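The other half of each kernel is the colour-flow update: the diagram's amplitude is added, with a sign and sometimes a factor i = cxtype( 0, 1 ), into the jamp entries for the colour flows it feeds. Below is a scalar stand-in for that pattern, assuming J_ACCESS::kernelAccessIcol simply yields a mutable reference to colour flow icol for the event in flight; the real accessor indexes the SOA buffer jamps[ncolor*2*nevtORneppV], so this is an illustration, not the plugin's J_ACCESS class.

  #include <complex>
  #include <cstdio>
  #include <vector>

  using cxtype = std::complex<double>;

  // Hypothetical scalar analogue of J_ACCESS::kernelAccessIcol: a mutable
  // reference to colour flow icol for the (single) event in flight.
  inline cxtype& kernelAccessIcol( std::vector<cxtype>& jamps, int icol ) { return jamps[icol]; }

  int main()
  {
    std::vector<cxtype> jamps( 120 ); // one event's colour flows (size invented)
    const cxtype amp( 0.7, -0.2 );    // one diagram's amplitude, i.e. amp_sv[0]
    // Mirrors e.g. diagram1097: jamp 96 gets +i*amp, jamp 97 gets -i*amp.
    kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp;
    kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp;
    printf( "jamp[96] = (%f, %f)\n", jamps[96].real(), jamps[96].imag() );
    return 0;
  }

The signed, sometimes imaginary coefficients are the colour-basis weights of each diagram; summing all diagrams into jamps before squaring against the colour matrix is what keeps these kernels independent of one another.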
+ //-------------------------------------------------------------------------- + + __global__ void + diagram1098( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1098 OF 1240 *** + // Wavefunction(s) for diagram number 1098 + // (none) + // Amplitude(s) for diagram number 1098 + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1099( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1099 OF 1240 *** + // Wavefunction(s) for diagram number 1099 + // (none) + // Amplitude(s) for diagram number 1099 + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1100 OF 1240 *** + // Wavefunction(s) for diagram number 1100 + // (none) + // Amplitude(s) for diagram number 1100 + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1101 OF 1240 *** + // Wavefunction(s) for diagram number 1101 + // (none) + // Amplitude(s) for diagram number 1101 + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1102 OF 1240 *** + // Wavefunction(s) for diagram number 1102 + // (none) + // Amplitude(s) for diagram number 1102 + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
diagram1103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1103 OF 1240 *** + // Wavefunction(s) for diagram number 1103 + // (none) + // Amplitude(s) for diagram number 1103 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1104 OF 1240 *** + // Wavefunction(s) for diagram number 1104 + // (none) + // Amplitude(s) for diagram number 1104 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
+#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1105 OF 1240 *** + // Wavefunction(s) for diagram number 1105 + // (none) + // Amplitude(s) for diagram number 1105 + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1106 OF 1240 *** + // Wavefunction(s) for diagram number 1106 + // (none) + // Amplitude(s) for diagram number 1106 + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // 
output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1107 OF 1240 *** + // Wavefunction(s) for diagram number 1107 + // (none) + // Amplitude(s) for diagram number 1107 + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1108 OF 1240 *** + // Wavefunction(s) for diagram number 1108 + // (none) + // Amplitude(s) for diagram number 1108 + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1109( 
fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1109 OF 1240 *** + // Wavefunction(s) for diagram number 1109 + // (none) + // Amplitude(s) for diagram number 1109 + FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const 
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1110 OF 1240 *** + // Wavefunction(s) for diagram number 1110 + // (none) + // Amplitude(s) for diagram number 1110 + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1111 OF 1240 *** + // Wavefunction(s) for diagram number 1111 + // (none) + // Amplitude(s) for diagram number 1111 + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], 
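
[Editorial note, not part of the patch] The paired statements `if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )` and `if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] )` seen in these kernels implement single-diagram enhancement: across all 1240 diagrams, the numerator collects |amp|^2 only for the diagram matching the sampled channel, while the denominator collects |amp|^2 for every diagram, so the ratio is the fraction of the squared amplitude carried by the chosen channel. A minimal standalone sketch of that accumulation, using toy amplitudes for a hypothetical three-diagram process and a local stand-in for the plugin's cxabs2 helper:

// Standalone illustration of the multichannel (SDE) accumulation pattern.
// Build with e.g. "g++ -std=c++17 sde_sketch.cc".
#include <complex>
#include <cstdio>
#include <vector>
typedef double fptype;
typedef std::complex<fptype> cxtype;
static fptype cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2

int main()
{
  const unsigned int channelId = 2; // sampled channel (1-based); 0 disables SDE
  const std::vector<cxtype> amps = { { 1, 2 }, { 0, 1 }, { 3, 0 } }; // toy per-diagram amplitudes
  fptype numerator = 0, denominator = 0;
  for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
  {
    if( channelId == idiag ) numerator += cxabs2( amps[idiag - 1] ); // selected diagram only
    if( channelId != 0 ) denominator += cxabs2( amps[idiag - 1] );   // every diagram
  }
  printf( "channel weight = %f\n", numerator / denominator ); // fraction of |amp|^2 in channel 2
  return 0;
}
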
add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1112 OF 1240 ***
+ // Wavefunction(s) for diagram number 1112
+ // (none)
+ // Amplitude(s) for diagram number 1112
+ FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1113 OF 1240 ***
+ // Wavefunction(s) for diagram number 1113
+ // (none)
+ // Amplitude(s) for diagram number 1113
+ VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype*
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1114 OF 1240 *** + // Wavefunction(s) for diagram number 1114 + // (none) + // Amplitude(s) for diagram number 1114 + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1115 OF 1240 *** + // Wavefunction(s) for diagram number 1115 + // (none) + // Amplitude(s) for diagram number 1115 + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // 
input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1116 OF 1240 *** + // Wavefunction(s) for diagram number 1116 + // (none) + // Amplitude(s) for diagram number 1116 + FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1117 OF 1240 *** + // Wavefunction(s) for diagram number 1117 + // (none) + // Amplitude(s) for diagram number 1117 + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1118 OF 1240 *** + // Wavefunction(s) for diagram number 1118 + // (none) + // Amplitude(s) for diagram number 1118 + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1119 OF 1240 *** + // Wavefunction(s) for diagram number 1119 + // (none) + // 
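
[Editorial note, not part of the patch] Each jamps entry updated in these kernels is one colour-flow amplitude, and the cxtype( 0, 1 ) factors are simply colour coefficients of ±i. Once all diagrams have been accumulated, the squared matrix element follows by contracting the jamp vector with the process colour matrix, |M|^2 = sum_ij conj(jamp_i) cf_ij jamp_j. A toy sketch of that contraction for a hypothetical two-flow case (the real process has many more colour flows, and the actual matrix is emitted elsewhere by the code generator):

// Standalone illustration of the colour contraction of jamp amplitudes.
// The 2x2 values below are toy numbers, not the real colour matrix.
#include <complex>
#include <cstdio>
typedef double fptype;
typedef std::complex<fptype> cxtype;

int main()
{
  constexpr int ncolor = 2;
  const cxtype jamp[ncolor] = { { 1., 0. }, { 0., 1. } };         // toy colour-flow amplitudes
  const fptype cf[ncolor][ncolor] = { { 2., -1. }, { -1., 2. } }; // toy (symmetric) colour matrix
  fptype me2 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += std::real( std::conj( jamp[i] ) * cf[i][j] * jamp[j] );
  printf( "|M|^2 (toy) = %f\n", me2 );
  return 0;
}
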
Amplitude(s) for diagram number 1119 + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1120 OF 1240 *** + // Wavefunction(s) for diagram number 1120 + // (none) + // Amplitude(s) for diagram number 1120 + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1121 OF 1240 *** + // Wavefunction(s) for diagram number 1121 + // (none) + // Amplitude(s) for diagram number 1121 + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1122 OF 1240 *** + // Wavefunction(s) for diagram number 1122 + // (none) + // Amplitude(s) for diagram number 1122 + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1123 OF 1240 *** + // Wavefunction(s) for diagram number 1123 + // (none) + // Amplitude(s) for diagram number 1123 + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1124 OF 1240 *** + // Wavefunction(s) for diagram number 1124 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., 
w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] ); + // Amplitude(s) for diagram number 1124 + VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) 
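
[Editorial note, not part of the patch] The three VVVVnP0_1 calls under diagram 1124 write internal wavefunctions into slots w_fp[21], w_fp[71] and w_fp[97], which the subsequent diagrams then reuse without recomputation until the slots are overwritten. The signature comment wavefunctions[nwf*2*nw6*nevtORneppV] describes the flat buffer behind those slots; the sketch below shows one plausible flat-indexing convention, with nw6 and neppV values chosen for illustration (the exact ordering in the plugin's memory-access classes may differ):

// Standalone illustration of flat indexing into a wavefunction buffer laid
// out as [nwf][nw6][2][neppV] (assumed ordering).
#include <cstddef>
#include <cstdio>

constexpr int nw6 = 6;   // complex components per wavefunction
constexpr int neppV = 4; // events per SIMD page (assumed)

// Flat index of the real (ireim = 0) or imaginary (ireim = 1) part of
// component iw6 of wavefunction slot iwf, for event ievt within the page.
inline std::size_t wfsIndex( int iwf, int iw6, int ireim, int ievt )
{
  return ( ( (std::size_t)iwf * nw6 + iw6 ) * 2 + ireim ) * neppV + ievt;
}

int main()
{
  // e.g. slot w_fp[21]: real part of component 0 for the first event in the page
  printf( "flat index = %zu\n", wfsIndex( 21, 0, 0, 0 ) );
  return 0;
}
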
+= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1125 OF 1240 *** + // Wavefunction(s) for diagram number 1125 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] ); + // Amplitude(s) for diagram number 1125 + VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1126 OF 1240 *** + // Wavefunction(s) for diagram number 1126 + VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1126 + VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: 
channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1127 OF 1240 *** + // Wavefunction(s) for diagram number 1127 + // (none) + // Amplitude(s) for diagram number 1127 + VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + 
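
[Editorial note, not part of the patch] Every jamp update goes through J_ACCESS::kernelAccessIcol( jamps, icol ), which hides how the current event is located inside the jamps buffer: one event per GPU thread in CUDA builds, one SIMD page of events in C++ builds. Below is a deliberately simplified single-event stand-in for that accessor; the plugin's real memory-access class also handles the fptype-pair-to-complex view of the buffer and the per-thread or per-page event offset:

// Trivial stand-in for the kernel access pattern behind J_ACCESS.
#include <complex>
typedef double fptype;
typedef std::complex<fptype> cxtype;

struct KernelAccessJamps
{
  // Return a reference to colour-flow amplitude icol for the current event;
  // real builds would add a thread- or page-dependent event offset here.
  static cxtype& kernelAccessIcol( cxtype* jamps, int icol ) { return jamps[icol]; }
};

int main()
{
  cxtype jamps[120] = {}; // zero-initialised buffer, one entry per colour flow
  // same shape as the generated statements: jamp += (+/-i) * amp
  KernelAccessJamps::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * cxtype( 0.5, 0.25 );
  return 0;
}
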
J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1128 OF 1240 *** + // Wavefunction(s) for diagram number 1128 + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + // Amplitude(s) for diagram number 1128 + FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1129 OF 1240 ***
+ // Wavefunction(s) for diagram number 1129
+ // (none)
+ // Amplitude(s) for diagram number 1129
+ FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1130 OF 1240 ***
+ // Wavefunction(s) for diagram number 1130
+ // (none)
+ // Amplitude(s) for diagram number 1130
+ FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+
J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1131 OF 1240 *** + // Wavefunction(s) for diagram number 1131 + // (none) + // Amplitude(s) for diagram number 1131 + FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
+  __global__ void
+  diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1132 OF 1240 ***
+    // Wavefunction(s) for diagram number 1132
+    // (none)
+    // Amplitude(s) for diagram number 1132
+    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
"diagram_boilerplate.h" + // *** DIAGRAM 1133 OF 1240 *** + // Wavefunction(s) for diagram number 1133 + // (none) + // Amplitude(s) for diagram number 1133 + FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1134 OF 1240 *** + // Wavefunction(s) for diagram number 1134 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + // Amplitude(s) for diagram number 1134 + FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1135 OF 1240 ***
+    // Wavefunction(s) for diagram number 1135
+    // (none)
+    // Amplitude(s) for diagram number 1135
+    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1136 OF 1240 ***
+    // Wavefunction(s) for diagram number 1136
+    // (none)
+    // Amplitude(s) for diagram number 1136
+    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
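[Editorial note] Every accumulation above goes through J_ACCESS::kernelAccessIcol( jamps, icol ), whose implementation is not part of this diff. As a reading aid, here is a hypothetical scalar stand-in, assuming an [icol][re/im][ievt] structure-of-arrays layout consistent with the signature comment "jamps[ncolor*2*nevtORneppV]"; the plugin's real accessor and layout may differ.

  #include <complex>
  using fptype = double;
  // Hypothetical stand-in for the color-amplitude accessor (layout is an assumption)
  struct JampAccessorSketch
  {
    fptype* jamps; // color amplitude buffer for all events
    int nevt;      // number of events (or SIMD lanes) in the buffer
    int ievt;      // event handled by this kernel thread
    void add( int icol, const std::complex<fptype>& amp ) const
    {
      jamps[( icol * 2 + 0 ) * nevt + ievt] += amp.real(); // real part
      jamps[( icol * 2 + 1 ) * nevt + ievt] += amp.imag(); // imaginary part
    }
  };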
+  __global__ void
+  diagram1137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1137 OF 1240 ***
+    // Wavefunction(s) for diagram number 1137
+    // (none)
+    // Amplitude(s) for diagram number 1137
+    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1138 OF 1240 ***
+    // Wavefunction(s) for diagram number 1138
+    // (none)
+    // Amplitude(s) for diagram number 1138
+    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1139 OF 1240 ***
+    // Wavefunction(s) for diagram number 1139
+    // (none)
+    // Amplitude(s) for diagram number 1139
+    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1140 OF 1240 ***
+    // Wavefunction(s) for diagram number 1140
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1140
+    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
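[Editorial note] Many accumulations in these kernels multiply the amplitude by cxtype( 0, 1 ) before adding or subtracting it, i.e. they rotate it by the imaginary unit, with opposite signs on paired color flows. A tiny self-contained illustration (std::complex<double> stands in for the plugin's cxtype):

  #include <cassert>
  #include <complex>
  using cxtype = std::complex<double>;
  int main()
  {
    const cxtype amp( 0.5, -2.0 );
    cxtype jamp( 0., 0. );
    jamp += cxtype( 0., 1. ) * amp; // adds i*amp, i.e. ( 2.0, 0.5 )
    assert( jamp == cxtype( 2.0, 0.5 ) );
    jamp -= cxtype( 0., 1. ) * amp; // the opposite-sign color flow cancels it exactly
    assert( jamp == cxtype( 0., 0. ) );
    return 0;
  }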
+  __global__ void
+  diagram1141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1141 OF 1240 ***
+    // Wavefunction(s) for diagram number 1141
+    VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 1141
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1142 OF 1240 ***
+    // Wavefunction(s) for diagram number 1142
+    VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 1142
+    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1143 OF 1240 ***
+    // Wavefunction(s) for diagram number 1143
+    // (none)
+    // Amplitude(s) for diagram number 1143
+    VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1144 OF 1240 ***
+    // Wavefunction(s) for diagram number 1144
+    FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
+    FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 1144
+    FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1145 OF 1240 ***
+    // Wavefunction(s) for diagram number 1145
+    // (none)
+    // Amplitude(s) for diagram number 1145
+    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1146 OF 1240 ***
+    // Wavefunction(s) for diagram number 1146
+    // (none)
+    // Amplitude(s) for diagram number 1146
+    FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1147 OF 1240 ***
+    // Wavefunction(s) for diagram number 1147
+    // (none)
+    // Amplitude(s) for diagram number 1147
+    FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
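[Editorial note] Since every diagramXXXX kernel shares one uniform signature, a natural way for a caller to drive them is a table of function pointers executed in sequence over the same wfs/jamps buffers. The plugin's actual driver is not part of this diff; the following is a hypothetical C++-side sketch under that assumption, with all names other than the shared signature invented for illustration.

  using fptype = double;
  // Hypothetical per-diagram kernel signature, mirroring the C++ branch above
  using DiagramFn = void ( * )( fptype* wfs, fptype* jamps,
                                const unsigned int* channelIds,
                                const fptype** COUPs,
                                fptype* numerators, fptype* denominators );
  // Hypothetical driver: run all diagrams in order, accumulating into jamps
  void runDiagramsSketch( const DiagramFn* table, int ndiagrams,
                          fptype* wfs, fptype* jamps,
                          const unsigned int* channelIds, const fptype** COUPs,
                          fptype* numerators, fptype* denominators )
  {
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      table[idiag]( wfs, jamps, channelIds, COUPs, numerators, denominators );
  }

On the GPU branch each entry would instead be launched as a kernel over all events, which is presumably why the couplings argument switches from per-page COUPs to the all-event couplings array under MGONGPUCPP_GPUIMPL.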
+  __global__ void
+  diagram1148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1148 OF 1240 ***
+    // Wavefunction(s) for diagram number 1148
+    // (none)
+    // Amplitude(s) for diagram number 1148
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1149 OF 1240 ***
+    // Wavefunction(s) for diagram number 1149
+    // (none)
+    // Amplitude(s) for diagram number 1149
+    FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1150 OF 1240 *** + // Wavefunction(s) for diagram number 1150 + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + // Amplitude(s) for diagram number 1150 + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1151 OF 1240 *** + // Wavefunction(s) for diagram number 1151 + // (none) + // Amplitude(s) for diagram number 1151 + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1152 OF 1240 ***
+    // Wavefunction(s) for diagram number 1152
+    // (none)
+    // Amplitude(s) for diagram number 1152
+    FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1153 OF 1240 ***
+    // Wavefunction(s) for diagram number 1153
+    // (none)
+    // Amplitude(s) for diagram number 1153
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1154 OF 1240 ***
+    // Wavefunction(s) for diagram number 1154
+    // (none)
+    // Amplitude(s) for diagram number 1154
+    FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1155 OF 1240 ***
+    // Wavefunction(s) for diagram number 1155
+    // (none)
+    // Amplitude(s) for diagram number 1155
+    FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1156 OF 1240 ***
+    // Wavefunction(s) for diagram number 1156
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1156
+    VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1157 OF 1240 ***
+    // Wavefunction(s) for diagram number 1157
+    VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 1157
+    VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1158 OF 1240 ***
+    // Wavefunction(s) for diagram number 1158
+    VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] );
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    // Amplitude(s) for diagram number 1158
+    VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1159 OF 1240 ***
+    // Wavefunction(s) for diagram number 1159
+    // (none)
+    // Amplitude(s) for diagram number 1159
+    VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
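[Editor's note, not part of the patch] Every diagramNNNN kernel in this hunk follows the same pattern: a HELAS-style vertex call (FFV1_0, VVV1_0, VVVV1_0, ...) produces one complex amplitude amp_sv[0], which is then accumulated into a handful of color flows with a weight of +1, -1, +i or -i through J_ACCESS::kernelAccessIcol. A minimal self-contained C++ sketch of that accumulation step follows, using std::complex in place of the plugin's cxtype/fptype and a plain array in place of the real jamps accessor; addToColorFlows, the array size and the numeric values are all hypothetical illustrations, not plugin API.

#include <array>
#include <complex>
#include <cstdio>
#include <initializer_list>
#include <utility>

using cxtype = std::complex<double>; // stand-in for the plugin's complex type

// Add one diagram's amplitude into the listed color flows, each with its own weight.
// In the generated kernels this corresponds to a run of J_ACCESS::kernelAccessIcol lines.
void addToColorFlows( std::array<cxtype, 120>& jamps,
                      const cxtype& amp,
                      std::initializer_list<std::pair<int, cxtype>> terms )
{
  for( const auto& [icol, weight] : terms ) jamps[icol] += weight * amp;
}

int main()
{
  std::array<cxtype, 120> jamps{}; // one running sum per color flow (size is illustrative)
  const cxtype I( 0., 1. );
  const cxtype amp( 0.3, -0.7 ); // in the real code this comes from FFV1_0/VVV1_0/VVVV1_0
  addToColorFlows( jamps, amp, { { 4, 1. }, { 5, -1. } } );  // like the plain +=/-= amp_sv[0] blocks
  addToColorFlows( jamps, amp, { { 97, I }, { 100, -I } } ); // like the cxtype( 0, 1 ) * amp_sv[0] blocks
  std::printf( "jamp[4] = ( %g, %g )\n", jamps[4].real(), jamps[4].imag() );
  return 0;
}

Judging by the jamps[ncolor*2*nevtORneppV] layout documented in the kernel signatures, the real code stores separate real and imaginary parts per event (or per event page), which is presumably why it goes through the J_ACCESS kernel accessor rather than indexing a complex array directly.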
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1160 OF 1240 ***
+    // Wavefunction(s) for diagram number 1160
+    FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
+    // Amplitude(s) for diagram number 1160
+    FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1161 OF 1240 ***
+    // Wavefunction(s) for diagram number 1161
+    // (none)
+    // Amplitude(s) for diagram number 1161
+    FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1162 OF 1240 ***
+    // Wavefunction(s) for diagram number 1162
+    // (none)
+    // Amplitude(s) for diagram number 1162
+    FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1163 OF 1240 ***
+    // Wavefunction(s) for diagram number 1163
+    // (none)
+    // Amplitude(s) for diagram number 1163
+    FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1164 OF 1240 ***
+    // Wavefunction(s) for diagram number 1164
+    // (none)
+    // Amplitude(s) for diagram number 1164
+    FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1165 OF 1240 ***
+    // Wavefunction(s) for diagram number 1165
+    // (none)
+    // Amplitude(s) for diagram number 1165
+    FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1166 OF 1240 ***
+    // Wavefunction(s) for diagram number 1166
+    FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+    // Amplitude(s) for diagram number 1166
+    FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1167 OF 1240 ***
+    // Wavefunction(s) for diagram number 1167
+    // (none)
+    // Amplitude(s) for diagram number 1167
+    FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1168 OF 1240 ***
+    // Wavefunction(s) for diagram number 1168
+    // (none)
+    // Amplitude(s) for diagram number 1168
+    FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1169 OF 1240 ***
+    // Wavefunction(s) for diagram number 1169
+    // (none)
+    // Amplitude(s) for diagram number 1169
+    FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1170 OF 1240 ***
+    // Wavefunction(s) for diagram number 1170
+    // (none)
+    // Amplitude(s) for diagram number 1170
+    FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1171 OF 1240 *** + // Wavefunction(s) for diagram number 1171 + // (none) + // Amplitude(s) for diagram number 1171 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
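Every generated diagramNNNN kernel in this hunk shares the uniform signature seen above and immediately includes diagram_boilerplate.h, which is not itself part of this hunk. As a minimal sketch only, assuming no more than what the generated comments state (this is not the plugin's actual header), the branch without MGONGPU_SUPPORTS_MULTICHANNEL could reduce to the advertised sanity check:

  // diagram_boilerplate.h : hypothetical sketch, not the real plugin header
  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Multichannel SDE enabled: channelIds selects this event's single-diagram
  // channel; numerators/denominators accumulate its weight for helicity ihel
  // (details omitted here).
  #else
  // Multichannel disabled: the uniform kernel interface is kept, but the
  // three multichannel arguments must not be used, so check they are null.
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
  #endif

In the kernel bodies, each FFV1_0/VVV1_0/VVVV*_0 call evaluates one helicity amplitude into the amp_fp buffer (read back as amp_sv[0]), and the J_ACCESS::kernelAccessIcol lines then add it into the per-color jamps accumulators, either directly or multiplied by the imaginary unit via cxtype( 0, 1 ).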
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1172 OF 1240 ***
+ // Wavefunction(s) for diagram number 1172
+ VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
+ VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
+ FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+ FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+ FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+ // Amplitude(s) for diagram number 1172
+ FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1173 OF 1240 ***
+ // Wavefunction(s) for diagram number 1173
+ VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
+ VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+ // Amplitude(s) for diagram number 1173
+ FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1174 OF 1240 ***
+ // Wavefunction(s) for diagram number 1174
+ // (none)
+ // Amplitude(s) for diagram number 1174
+ FFV1_0( w_fp[41], w_fp[77],
w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1175 OF 1240 *** + // Wavefunction(s) for diagram number 1175 + FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 1175 + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1176 OF 1240 *** + // Wavefunction(s) for diagram number 1176 + // (none) + // Amplitude(s) for diagram number 1176 + FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + 
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1177 OF 1240 *** + // Wavefunction(s) for diagram number 1177 + // (none) + // Amplitude(s) for diagram number 1177 + FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1178 OF 1240 *** + // Wavefunction(s) for diagram number 1178 + // (none) + // Amplitude(s) for diagram number 1178 + FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1179 OF 1240 *** + // Wavefunction(s) for diagram number 1179 + // (none) + // Amplitude(s) for diagram number 1179 + FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1180 OF 1240 *** + // Wavefunction(s) for diagram number 1180 + // (none) + // Amplitude(s) for diagram number 1180 + VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1181 OF 1240 *** + // Wavefunction(s) for diagram number 1181 + // (none) + // Amplitude(s) for diagram number 1181 + VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1182 OF 1240 *** + // Wavefunction(s) for diagram number 1182 + VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1182 + VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1183 OF 1240 *** + // Wavefunction(s) for diagram number 1183 + // (none) + // Amplitude(s) for diagram number 1183 + VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1184 OF 1240 *** + // Wavefunction(s) for diagram number 1184 + // (none) + // Amplitude(s) for diagram number 1184 + FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1185 OF 1240 *** + // Wavefunction(s) for diagram number 1185 + // (none) + // Amplitude(s) for diagram number 1185 + FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ 
void + diagram1186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1186 OF 1240 *** + // Wavefunction(s) for diagram number 1186 + // (none) + // Amplitude(s) for diagram number 1186 + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events 
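// Illustrative sketch (not from this patch): the #ifdef MGONGPUCPP_GPUIMPL split in these
// kernel signatures gives every diagramXXXX function two coupling interfaces. On GPU the
// kernel receives one dense array with the dependent couplings of all events; in C++ it
// receives an array of pointers, one per coupling, into the current event page. Assuming
// hypothetical buffers dependentCouplings/independentCouplings (names not taken from this
// patch), a C++ caller could fill COUPs along these lines:
//   const fptype* COUPs[nxcoup];
//   for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // alphas-dependent couplings: per event page
//     COUPs[idcoup] = &dependentCouplings[( ipagV * ndcoup + idcoup ) * 2];
//   for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // independent couplings: shared by all events
//     COUPs[ndcoup + iicoup] = &independentCouplings[iicoup * 2];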
+#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1187 OF 1240 *** + // Wavefunction(s) for diagram number 1187 + // (none) + // Amplitude(s) for diagram number 1187 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1188 OF 1240 *** + // Wavefunction(s) for diagram number 1188 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 1188 + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + FFV1_0( w_fp[60], 
w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1189 OF 1240 *** + // Wavefunction(s) for diagram number 1189 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1189 + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
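// Illustrative sketch (not from this patch): each helas call such as FFV1_0 or VVV1_0
// computes one complex amplitude for the current diagram, and the kernelAccessIcol lines
// scatter it into the color-flow array jamps with weight +1, -1, +i or -i. With the
// event/SIMD indexing stripped away (plain array indexing is an assumption here;
// kernelAccessIcol also selects the event slot), the pattern reduces to:
//   cxtype jamp[ncolor] = {};         // one partial sum per color flow
//   const cxtype amp = amp_sv[0];     // amplitude of the current diagram
//   jamp[25] -= cxtype( 0, 1 ) * amp; // subtract i*amp from color flow 25
//   jamp[28] += cxtype( 0, 1 ) * amp; // add i*amp to color flow 28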
J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1190( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1190 OF 1240 *** + // Wavefunction(s) for diagram number 1190 + // (none) + // Amplitude(s) for diagram number 1190 + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 
1191 OF 1240 *** + // Wavefunction(s) for diagram number 1191 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 1191 + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1192 OF 1240 *** + // Wavefunction(s) for diagram number 1192 + // (none) + // Amplitude(s) for diagram number 1192 + FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1193 OF 1240 *** + // Wavefunction(s) for diagram number 1193 + // (none) + // Amplitude(s) for diagram number 1193 + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1194 OF 1240 *** + // Wavefunction(s) for diagram number 1194 + // (none) + // Amplitude(s) for diagram number 1194 + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1195 OF 1240 *** + // Wavefunction(s) for diagram number 1195 + // (none) + // Amplitude(s) for diagram number 1195 + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1196 OF 1240 *** + // Wavefunction(s) for diagram number 1196 + // (none) + // Amplitude(s) for diagram number 1196 + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1197 OF 1240 *** + // Wavefunction(s) for diagram number 1197 + // (none) + // Amplitude(s) for diagram number 1197 + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; 
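// Illustrative sketch (not from this patch): diagram_boilerplate.h is included at the top
// of every diagramXXXX kernel so that all 1240 kernels keep one uniform signature. Going
// by the comment that precedes each include, when MGONGPU_SUPPORTS_MULTICHANNEL is not
// defined the three multichannel arguments are unused and the boilerplate can only
// sanity-check them, along the lines of:
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif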
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + 
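// Illustrative sketch (not from this patch): the four-gluon vertex carries three
// independent color/Lorentz structures, which is why each VVVV coupling enters through
// three helas variants: VVVV1_0/VVVV3_0/VVVV4_0 for amplitudes, and VVVV1P0_1/VVVV3P0_1/
// VVVV4P0_1 for internal wavefunctions, each followed by its own sign pattern in jamps.
// Schematically (V1..V4 and amp stand for the wavefunction and amplitude buffers):
//   VVVV1_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 1
//   VVVV3_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 3
//   VVVV4_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 4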
J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1198 OF 1240 *** + // Wavefunction(s) for diagram number 1198 + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1198 + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface 
for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1199 OF 1240 *** + // Wavefunction(s) for diagram number 1199 + // (none) + // Amplitude(s) for diagram number 1199 + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1200 OF 1240 *** + // Wavefunction(s) for diagram number 1200 + // (none) + // Amplitude(s) for diagram number 1200 + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1201 OF 1240 *** + // Wavefunction(s) for diagram number 1201 + // (none) + // Amplitude(s) for diagram number 1201 + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1202( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1202 OF 1240 *** + // Wavefunction(s) for diagram number 1202 + // (none) + // Amplitude(s) for diagram number 1202 + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) 
+= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1203 OF 1240 *** + // Wavefunction(s) for diagram number 1203 + // (none) + // Amplitude(s) for diagram number 1203 + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1204 OF 1240 *** + // Wavefunction(s) for diagram number 1204 + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 1204 + FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1205 OF 1240 *** + // Wavefunction(s) for diagram number 1205 + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1205 + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 
1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1206( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1206 OF 1240 *** + // Wavefunction(s) for diagram number 1206 + // (none) + // Amplitude(s) for diagram number 1206 + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1207 OF 1240 *** + // Wavefunction(s) for diagram number 1207 + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1207 + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + 
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1208 OF 1240 *** + // Wavefunction(s) for diagram number 1208 + // (none) + // Amplitude(s) for diagram number 1208 + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1209 OF 1240 *** + // Wavefunction(s) for diagram number 1209 + // (none) + // 
Amplitude(s) for diagram number 1209 + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1210 OF 1240 *** + // Wavefunction(s) for diagram number 1210 + // (none) + // Amplitude(s) for diagram number 1210 + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1211 OF 1240 *** + // Wavefunction(s) for diagram number 1211 + // (none) + // Amplitude(s) for diagram number 1211 + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1212 OF 1240 *** + // Wavefunction(s) for diagram number 1212 + // (none) + // Amplitude(s) for diagram number 1212 + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1213 OF 1240 *** + // Wavefunction(s) for diagram number 1213 + // (none) + // Amplitude(s) for diagram number 1213 + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1214 OF 1240 *** + // Wavefunction(s) for diagram number 1214 + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1214 + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1215 OF 1240 *** + // Wavefunction(s) for diagram number 1215 + // (none) + // Amplitude(s) for diagram number 1215 + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1216 OF 1240 *** + // Wavefunction(s) for diagram number 1216 + // (none) + // Amplitude(s) for diagram number 1216 + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1217 OF 1240 *** + // Wavefunction(s) for diagram number 1217 + // (none) + // Amplitude(s) for diagram number 1217 + FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1218 OF 1240 *** + // Wavefunction(s) for diagram number 1218 + // (none) + // Amplitude(s) for diagram number 1218 + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1219 OF 1240 *** + // Wavefunction(s) for diagram number 1219 + // (none) + // Amplitude(s) for diagram number 1219 + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1220 OF 1240 *** + // Wavefunction(s) for diagram number 1220 + // (none) + // Amplitude(s) for diagram number 1220 + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1220( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1220 OF 1240 ***
+    // Wavefunction(s) for diagram number 1220
+    // (none)
+    // Amplitude(s) for diagram number 1220
+    VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
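+  // Illustration only (an assumption about the access pattern, not generated code): each
+  // J_ACCESS::kernelAccessIcol( jamps, icol ) call above is expected to return a reference to the
+  // complex color amplitude icol of the current event (or SIMD event page), so that every line is
+  // a signed accumulation into that amplitude, roughly
+  //   cxtype_sv& jamp96 = J_ACCESS::kernelAccessIcol( jamps, 96 ); // color amplitude 96, this event page
+  //   jamp96 -= amp_sv[0];                                         // subtract this diagram's amplitude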
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1221( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1221 OF 1240 ***
+    // Wavefunction(s) for diagram number 1221
+    VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] );
+    VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] );
+    VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 1221
+    VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1222( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1222 OF 1240 ***
+    // Wavefunction(s) for diagram number 1222
+    // (none)
+    // Amplitude(s) for diagram number 1222
+    VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1223( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1223 OF 1240 ***
+    // Wavefunction(s) for diagram number 1223
+    // (none)
+    // Amplitude(s) for diagram number 1223
+    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1224( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1224 OF 1240 ***
+    // Wavefunction(s) for diagram number 1224
+    // (none)
+    // Amplitude(s) for diagram number 1224
+    FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
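+  // Illustration only (a reading of the arithmetic above, not new generated code): cxtype( 0, 1 )
+  // is the imaginary unit, so the FFV1_0 accumulations in diagrams like 1223 add or subtract
+  // i * amp_sv[0]; e.g. with amp = a + i*b, the update jamp += cxtype( 0, 1 ) * amp is equivalent to
+  //   jamp.real() += -b;
+  //   jamp.imag() += +a;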
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1225( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1225 OF 1240 ***
+    // Wavefunction(s) for diagram number 1225
+    // (none)
+    // Amplitude(s) for diagram number 1225
+    FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1226( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1226 OF 1240 ***
+    // Wavefunction(s) for diagram number 1226
+    // (none)
+    // Amplitude(s) for diagram number 1226
+    FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1227( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1227 OF 1240 ***
+    // Wavefunction(s) for diagram number 1227
+    // (none)
+    // Amplitude(s) for diagram number 1227
+    VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
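+  // Illustration only (an assumption about the buffer management, not stated in this diff): the
+  // w_fp array appears to act as a shared scratch pool of wavefunction slots, so a diagram may
+  // overwrite slots whose previous contents are no longer needed; diagram1228 below, for instance,
+  // recycles slots 62, 80 and 79 for the three VVV1P0_1 internal propagators it computes.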
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1228( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1228 OF 1240 ***
+    // Wavefunction(s) for diagram number 1228
+    VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] );
+    VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] );
+    // Amplitude(s) for diagram number 1228
+    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1229( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1229 OF 1240 ***
+    // Wavefunction(s) for diagram number 1229
+    // (none)
+    // Amplitude(s) for diagram number 1229
+    VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
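+  // Illustration only (a hypothetical caller, assuming a gpuLaunchKernel-style abstraction as used
+  // elsewhere in the plugin): with this uniform signature every diagram can be dispatched the same
+  // way from the host side, e.g. roughly
+  //   #ifdef MGONGPUCPP_GPUIMPL
+  //   gpuLaunchKernel( diagram1230, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
+  //   #else
+  //   diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); // one SIMD event page
+  //   #endif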
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1230( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1230 OF 1240 ***
+    // Wavefunction(s) for diagram number 1230
+    // (none)
+    // Amplitude(s) for diagram number 1230
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1231( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1231 OF 1240 ***
+    // Wavefunction(s) for diagram number 1231
+    // (none)
+    // Amplitude(s) for diagram number 1231
+    FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1232( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1232 OF 1240 ***
+    // Wavefunction(s) for diagram number 1232
+    // (none)
+    // Amplitude(s) for diagram number 1232
+    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1233( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1233 OF 1240 ***
+    // Wavefunction(s) for diagram number 1233
+    // (none)
+    // Amplitude(s) for diagram number 1233
+    FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
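+  // Illustration only (an assumption about the multichannel path, since diagram_boilerplate.h is
+  // not shown in this diff): #ifdef MGONGPU_SUPPORTS_MULTICHANNEL the boilerplate is expected to
+  // fold each diagram into the SDE numerators/denominators, schematically
+  //   if( channelId == 1234 ) numerators_sv += cxabs2( amp_sv[0] ); // |amp|^2 of the selected channel
+  //   denominators_sv += cxabs2( amp_sv[0] );                      // |amp|^2 summed over all diagrams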
jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1235 OF 1240 *** + // Wavefunction(s) for diagram number 1235 + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); + // Amplitude(s) for diagram number 1235 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) +=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1236 OF 1240 *** + // Wavefunction(s) for diagram number 1236 + // (none) + // Amplitude(s) for diagram number 1236 + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { +
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1237 OF 1240 *** + // Wavefunction(s) for diagram number 1237 + // (none) + // Amplitude(s) for diagram number 1237 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1238 OF 1240 *** + // Wavefunction(s) for diagram number 1238 + // (none) + //
Amplitude(s) for diagram number 1238 + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1239 OF 1240 *** + // Wavefunction(s) for diagram number 1239 + // (none) + // Amplitude(s) for diagram number 1239 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) *
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1240 OF 1240 *** + // Wavefunction(s) for diagram number 1240 + // (none) + // Amplitude(s) for diagram number 1240 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f index 3671cdce55..def489179c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true.
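Each generated diagramNNNN kernel above shares the exact same parameter list (wfs, jamps, channelIds, couplings/COUPs, numerators, denominators), even in builds without multichannel support, where the boilerplate merely asserts that the last three pointers are nullptr. The point of this uniform interface is that a driver can hold all 1240 per-diagram kernels in one table and launch them in a loop instead of through 1240 distinct call sites. The standalone C++ sketch below shows that dispatch pattern in miniature; the names (DiagramFn, diagramA, diagramB) and the toy data are illustrative, not the generated API.

#include <cstdio>

using fptype = double;
using DiagramFn = void ( * )( fptype* jamps, const fptype* couplings );

// Two stand-in "diagrams": each computes one amplitude from the couplings
// and scatters it, with signs, into the jamp slots it contributes to.
static void diagramA( fptype* jamps, const fptype* couplings )
{
  const fptype amp = couplings[0];
  jamps[3] += amp;
  jamps[5] -= amp;
}

static void diagramB( fptype* jamps, const fptype* couplings )
{
  const fptype amp = couplings[1];
  jamps[3] -= amp;
  jamps[22] += amp;
}

int main()
{
  fptype jamps[120] = { 0 }; // one accumulator per color flow
  const fptype couplings[2] = { 1.5, 0.25 };
  const DiagramFn diagrams[] = { diagramA, diagramB };
  for( DiagramFn fn : diagrams ) fn( jamps, couplings ); // uniform launch loop
  std::printf( "jamps[3]=%g jamps[5]=%g jamps[22]=%g\n", jamps[3], jamps[5], jamps[22] );
  return 0;
}

Because every entry has the same signature, adding or removing a diagram only changes the table, not the launch logic.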
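In the matrix1.f hunk below, the dense REAL*8 color matrix CF(NCOLOR,NCOLOR), initialized by thousands of DATA lines of repeating decimals, is replaced by a packed integer triangle CF(NCOLOR*(NCOLOR+1)/2) plus a common denominator, DATA DENOM/324/. The matrix is symmetric, so only one triangle needs to be stored (the new DATA blocks shrink by one entry per row: 120, 119, 118, ...), and the off-diagonal integers absorb the symmetry factor of two: the old CF(1,1)=1.264197530864197D+01 is exactly 4096/324 and is stored as 4096, while the old CF(2,1)=-1.580246913580247D+00 is -512/324 and is stored as -1024. Integer DATA is exact and far more compact than the decimal expansions it replaces. The standalone C++ sketch below shows how such a packed triangle is typically consumed when color-summing the squared amplitude; the 3x3 toy data and names are illustrative, and the real Fortran walks the triangle with its own CF_INDEX bookkeeping.

#include <complex>
#include <cstdio>

int main()
{
  const int ncolor = 3;
  // Packed upper triangle of a toy symmetric color matrix, row by row:
  // (1,1) (1,2) (1,3) (2,2) (2,3) (3,3); off-diagonal entries already doubled.
  const int cf[] = { 4096, -1024, 128, 4096, -1024, 4096 };
  const int denom = 324; // common denominator for all entries
  const std::complex<double> jamp[ncolor] = { { 1, 0 }, { 0, 1 }, { 1, 1 } };

  double me2 = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    me2 += cf[idx++] * std::norm( jamp[i] ); // diagonal term, counted once
    for( int j = i + 1; j < ncolor; j++ )    // off-diagonal: stored factor 2 covers (i,j) and (j,i)
      me2 += cf[idx++] * ( jamp[i] * std::conj( jamp[j] ) ).real();
  }
  me2 /= denom; // divide once by the common denominator
  std::printf( "color-summed |M|^2 = %g\n", me2 );
  return 0;
}

The real part suffices for the off-diagonal terms because Re(jamp_i * conj(jamp_j)) is symmetric in i and j, which is exactly what lets the doubled integer stand in for both matrix entries.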
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index 07ccd4d1a4..ac98d845bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -419,7 +419,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -462,7 +462,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(3030) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -505,9375 +506,738 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 1),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 1),I= 61, 66) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 1),I= 67, 72) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA 
(CF(I, 1),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 1),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 91, 96) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I=115,120) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ + DATA DENOM/324/ + DATA (CF(I),I= 1,120) /4096,-1024,-1024,128,128,1280,-1024,128 + $ ,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160,992 + $ ,992,-448,-1024,128,128,-16,-16,-160,128,-16,-16,2,2,20,-16,2, + $ -160,20,-142,-124,2,20,20,-124,-124,56,128,-16,-16,2,2,20,1280, + $ -160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124, + $ -106,-268,-88,-16,2,-160,20,-142,-124,-160,20,992,-124,38,-106 + $ ,992,-124,-448,56,-268,-88,1010,-268,-268,884,884,-232,2,20,20, + $ -124,-124,56,20,200,-124,1028,-106,-88,-124,-106,56,-88,884, + $ -232,1028,-88,-88,-232,-232,272/ C 1 T(1,2,5,6,7,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 2),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 2),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 2),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I= 49, 54) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 2),I= 61, 66) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 67, 72) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 2),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 2),I= 91, 96) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 2),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 2),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I=115,120) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=121,239) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,128,-1024,-16,-160,128,-16,-16,128,2,20,-16,2,2,20,20,-124 + $ ,-124,56,-16,2,-160,20,-142,-124,-16,128,2,20,-16,2,-160,1280 + $ ,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124 + $ ,1010,1028,2,20,20,-124,-124,56,20,200,-124,1028,-106,-88,-124, + $ -106,56,-88,884,-232,1028,-88,-88,-232,-232,272,-16,2,-160,20, + $ -142,-124,-160,20,992,-124,38,-106,992,-124,-448,56,-268,-88 + $ ,1010,-268,-268,884,884,-232/ C 1 T(1,2,5,7,6,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 3),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 3),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - 
$ -2.469135802469136D-02/ - DATA (CF(I, 3),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 3),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 67, 72) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 3),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 3),I= 85, 90) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 3),I= 91, 96) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 3),I=109,114) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 3),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=240,357) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142,-124,128,-16,-16 + $ ,2,2,20,20,2,-124,56,20,-124,-16,2,-160,20,-142,-124,-160,20 + $ ,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268,-268,884 + $ ,884,-232,128,-16,-16,2,2,20,1280,-160,-160,20,20,200,1136,-142 + $ ,992,-124,1010,1028,-142,38,-124,-106,-268,-88,20,2,-124,56,20, + $ -124,200,20,-106,-88,-124,1028,1028,-88,-88,-232,-232,272,-124, + $ -106,56,-88,884,-232/ C 1 T(1,2,6,5,7,3,4) - DATA (CF(I, 4),I= 1, 6) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 4),I= 7, 12) /-2.469135802469136D-02, - $ 
-2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 4),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 4),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 4),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 4),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I= 67, 72) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 4),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 4),I= 85, 90) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 91, 96) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 4),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 4),I=109,114) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=358,474) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-16, + $ -160,128,-1024,-16,128,2,20,20,-124,-124,56,-16,128,2,20,-16,2 + $ 
,2,-16,-142,-124,-160,20,2,20,20,-124,-124,56,20,200,-124,1028, + $ -106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232,-232,272, + $ -16,128,2,20,-16,2,-160,1280,20,200,-160,20,-142,38,-124,-106, + $ -268,-88,1136,-142,992,-124,1010,1028,2,-16,-142,-124,-160,20 + $ ,20,-160,38,-106,992,-124,1010,-268,-268,884,884,-232,992,-124, + $ -448,56,-268,-88/ C 1 T(1,2,6,7,5,3,4) - DATA (CF(I, 5),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 5),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 5),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 5),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 5),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 61, 66) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 5),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 5),I= 85, 90) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 5),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 5),I=109,114) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 5),I=115,120) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=475,590) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-16,128, + $ -160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2,-124,56,20,-124 + $ ,128,-16,-16,2,2,20,2,-16,-142,-124,-160,20,20,-160,38,-106,992 + $ ,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268,-88,20 + $ ,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232, + $ -232,272,-124,-106,56,-88,884,-232,128,-16,-16,2,2,20,1280,-160 + $ ,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124,-106, + $ -268,-88/ C 1 T(1,2,7,5,6,3,4) - DATA (CF(I, 6),I= 1, 6) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 6),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 6),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 6),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 6),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 6),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 6),I= 61, 66) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 6),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 85, 90) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 6),I=109,114) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I=115,120) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ + DATA (CF(I),I=591,705) /4096,-160,-16,992,-448,-160,992,-16,128 + $ ,1136,992,1280,-160,128,-1024,-16,-160,128,-16,-160,-16,-16,128 + $ ,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124,-160,20,-16,128 + $ ,2,20,-16,2,20,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028 + $ ,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232,2,-16,-142, + $ -124,-160,20,20,-160,38,-106,992,-124,1010,-268,-268,884,884, + $ -232,992,-124,-448,56,-268,-88,-16,128,2,20,-16,2,-160,1280,20 + $ ,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124,1010 + $ ,1028/ C 1 T(1,2,7,6,5,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 7, 12) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 7),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 7),I= 37, 42) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 7),I= 43, 48) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 7),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 7),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 7),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 7),I= 91, 96) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 7),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 7),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 7),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I=115,120) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=706,819) /4096,-1024,-1024,128,128,1280,1280,-160 + $ ,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2,2,20 + $ ,1280,-160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38, + $ -124,-106,-268,-88,-1024,128,128,-16,-16,-160,128,-16,-16,2,2 + $ ,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124,56,-160,20,-16,2, + $ -124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124,38,-106, + $ -268,1010,884,-232,-268,884,20,-124,2,20,56,-124,-124,-106,56, + $ -88,884,-232,20,200,-124,1028,-106,-88,-88,1028,-232,272,-88, + $ -232/ C 1 T(1,5,2,6,7,3,4) - DATA (CF(I, 8),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 8),I= 13, 18) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 8),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 8),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 31, 36) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 8),I= 37, 42) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 43, 48) /1.753086419753086D+00, - 
$ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 8),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 8),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 8),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 8),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 91, 96) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 8),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 8),I=115,120) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=820,932) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2, + $ -160,1280,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142 + $ ,992,-124,1010,1028,128,-1024,-16,-160,128,-16,-16,128,2,20,-16 + $ ,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20,-124,2,20,56 + $ ,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88,-88 + $ ,1028,-232,272,-88,-232,-160,20,-16,2,-124,-142,992,-124,-448 + $ ,56,-268,-88,-160,20,992,-124,38,-106,-268,1010,884,-232,-268 + $ ,884/ C 1 T(1,5,2,7,6,3,4) - DATA (CF(I, 9),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 19, 24) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ 
- DATA (CF(I, 9),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 9),I= 43, 48) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 9),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 9),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 9),I= 79, 84) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 9),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 9),I= 91, 96) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 9),I=103,108) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 9),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 9),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ + DATA (CF(I),I=933,1044) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124, + $ -160,20,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268, + $ -268,884,884,-232,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142 + $ ,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,-16,2,128,-16,20 + $ ,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,-124,56,20,2,-124,20,1028,-88,-88,-232, + $ -232,272,200,20,-106,-88,-124,1028,-106,-124,884,-232,56,-88/ C 1 T(1,5,6,2,7,3,4) - DATA (CF(I, 10),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 10),I= 7, 12) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 10),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 19, 24) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 10),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I= 43, 48) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 10),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 10),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 10),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 10),I= 79, 84) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 10),I= 91, 96) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 10),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 10),I=103,108) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 10),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=1045,1155) 
/4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,20,200, + $ -124,1028,-106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232, + $ -232,272,-16,-160,128,-1024,-16,128,2,20,20,-124,-124,56,-16 + $ ,128,2,20,-16,2,2,-16,-142,-124,-160,20,2,20,-16,128,2,-16,-142 + $ ,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136,1010 + $ ,1028,992,-124,-142,-124,2,-16,20,-160,1010,-268,-268,884,884, + $ -232,20,-160,38,-106,992,-124,-124,992,-268,-88,-448,56/ C 1 T(1,5,6,7,2,3,4) - DATA (CF(I, 11),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 11),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 11),I= 13, 18) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 11),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 11),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 37, 42) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 11),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 11),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 11),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 11),I= 79, 84) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 11),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 11),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 11),I=103,108) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 11),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 11),I=115,120) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1156,1265) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,20,-160,38, + $ -106,992,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268 + $ ,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2, + $ -124,56,20,-124,128,-16,-16,2,2,20,-124,56,20,2,-124,20,1028, + $ -88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124,884, + $ -232,56,-88,-16,2,128,-16,20,2,1136,-142,992,-124,1010,1028 + $ ,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106/ C 1 T(1,5,7,2,6,3,4) - DATA (CF(I, 12),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 12),I= 7, 12) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 12),I= 13, 18) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 12),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 12),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 12),I= 37, 42) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 12),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 12),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 12),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 12),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 12),I= 79, 84) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 12),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 12),I=103,108) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 12),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 12),I=115,120) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ + DATA (CF(I),I=1266,1374) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,200,20,-106,-88, + $ -124,1028,1028,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232, + $ -160,-16,-16,128,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124 + $ ,-160,20,-16,128,2,20,-16,2,-142,-124,2,-16,20,-160,1010,-268, + $ -268,884,884,-232,20,-160,38,-106,992,-124,-124,992,-268,-88, + $ -448,56,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88,-160,1280 + $ ,20,200,-160,20,-142,1136,1010,1028,992,-124/ C 1 T(1,5,7,6,2,3,4) - DATA (CF(I, 13),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 13),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 13),I= 31, 36) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 13),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 13),I= 43, 48) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 13),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 13),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 67, 72) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 13),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 13),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 13),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I=109,114) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 13),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=1375,1482) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,1136,-142,992,-124,1010 + $ ,1028,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106,-160 + $ ,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124 + $ ,38,-106,-268,1010,884,-232,-268,884,-1024,128,128,-16,-16,-160 + $ ,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124 + $ ,56,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88,1028, + $ -232,272,-88,-232,20,200,-124,1028,-106,-88/ C 1 T(1,6,2,5,7,3,4) - DATA (CF(I, 14),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 14),I= 7, 12) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 14),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 14),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 31, 36) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 37, 42) /-2.469135802469136D-01 - $ 
,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 14),I= 43, 48) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 67, 72) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 14),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 14),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 14),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I=109,114) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 14),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1483,1589) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88, + $ -160,1280,20,200,-160,20,-142,1136,1010,1028,992,-124,20,-124,2 + $ ,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88 + $ ,-88,1028,-232,272,-88,-232,128,-1024,-16,-160,128,-16,-16,128 + $ ,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20, + $ -160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010,884, + $ -232,-268,884,-160,20,992,-124,38,-106/ C 1 T(1,6,2,7,5,3,4) - DATA (CF(I, 15),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 15),I= 19, 
24) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 15),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 43, 48) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 15),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I= 55, 60) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 15),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 15),I= 67, 72) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 15),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I=103,108) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 15),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=1590,1695) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160 + $ ,20,992,-124,38,-106,-268,1010,884,-232,-268,884,-16,2,128,-16 + $ ,20,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,128,-16,-1024,128,-160,-16,-16,2,-160 + $ ,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,56,-124, + $ -124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884,-232,56, + $ 
-88,200,20,-106,-88,-124,1028/ C 1 T(1,6,5,2,7,3,4) - DATA (CF(I, 16),I= 1, 6) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 16),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 13, 18) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 16),I= 19, 24) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 43, 48) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I= 55, 60) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 16),I= 67, 72) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 16),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 16),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I=103,108) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 16),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I=115,120) /3.086419753086420D-02, - $ 
-2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=1696,1800) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124 + $ ,1028,-106,-88,-88,1028,-232,272,-88,-232,2,20,-16,128,2,-16, + $ -142,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136 + $ ,1010,1028,992,-124,-16,-160,128,-1024,-16,128,2,20,20,-124, + $ -124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160,20,-124,-142,20 + $ ,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268,-88,-448 + $ ,56,20,-160,38,-106,992,-124/ C 1 T(1,6,5,7,2,3,4) - DATA (CF(I, 17),I= 1, 6) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 17),I= 7, 12) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 17),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 31, 36) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 17),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 17),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 17),I= 55, 60) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 17),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 17),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 17),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 17),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 17),I= 
97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 17),I=103,108) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 17),I=109,114) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 17),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=1801,1904) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,1010,-268,-268,884,884,-232,20,-160,38, + $ -106,992,-124,-124,992,-268,-88,-448,56,-124,56,20,2,-124,20 + $ ,1028,-88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124 + $ ,884,-232,56,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124, + $ -160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20,2,-16,20,2,128, + $ -16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124,-106 + $ ,1280,-160,-160,20,20,200/ C 1 T(1,6,7,2,5,3,4) - DATA (CF(I, 18),I= 1, 6) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 7, 12) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 18),I= 13, 18) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 18),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 18),I= 31, 36) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 18),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 18),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 18),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 55, 60) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 18),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 18),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-1.580246913580247D+00/ - DATA (CF(I, 18),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 18),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 18),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 18),I=103,108) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 18),I=109,114) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 18),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ + DATA (CF(I),I=1905,2007) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,1028,-88,-88,-232,-232,272,200,20,-106,-88, + $ -124,1028,-106,-124,884,-232,56,-88,-142,-124,2,-16,20,-160 + $ ,1010,-268,-268,884,884,-232,20,-160,38,-106,992,-124,-124,992, + $ -268,-88,-448,56,-160,-16,-16,128,128,-1024,20,2,-124,56,20, + $ -124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20/ C 1 T(1,6,7,5,2,3,4) - DATA (CF(I, 19),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 19),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 13, 18) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I= 19, 24) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 19),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 31, 36) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 19),I= 37, 42) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 19),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 19),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ 
,8.641975308641975D-02/ - DATA (CF(I, 19),I= 61, 66) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 19),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 85, 90) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 19),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 19),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=2008,2109) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124, + $ -106,1280,-160,-160,20,20,200,20,-160,-124,-142,-16,2,-124,992, + $ -268,-88,-448,56,-268,1010,884,-232,-268,884,-160,20,992,-124 + $ ,38,-106,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88 + $ ,1028,-232,272,-88,-232,20,200,-124,1028,-106,-88,-1024,128,128 + $ ,-16,-16,-160,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20 + $ ,20,-124,-124,56/ C 1 T(1,7,2,5,6,3,4) - DATA (CF(I, 20),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 20),I= 7, 12) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 20),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 20),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 20),I= 31, 36) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 37, 42) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ 
+00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 20),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 20),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 20),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 61, 66) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 20),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 20),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I= 85, 90) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 20),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 20),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 20),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2110,2210) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20,-124,20,56,-124,2,20,-106,-124,884, + $ -232,56,-88,-88,1028,-232,272,-88,-232,20,200,-124,1028,-106, + $ -88,20,-160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010 + $ ,884,-232,-268,884,-160,20,992,-124,38,-106,128,-1024,-16,-160 + $ ,128,-16,-16,128,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20, + $ -142,-124/ C 1 T(1,7,2,6,5,3,4) - DATA (CF(I, 21),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 21),I= 13, 18) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 21),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 21),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 21),I= 37, 42) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 21),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 55, 60) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 21),I= 61, 66) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 21),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 21),I= 79, 84) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 21),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 21),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 21),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 21),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 21),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2211,2310) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,-124,992,-268,-88,-448,56,-268,1010,884,-232,-268,884, + $ -160,20,992,-124,38,-106,2,-16,20,2,128,-16,-142,1136,1010,1028 + $ ,992,-124,38,-142,-268,-88,-124,-106,1280,-160,-160,20,20,200 + $ ,56,-124,-124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884, + $ -232,56,-88,200,20,-106,-88,-124,1028,128,-16,-1024,128,-160, + $ -16,-16,2,-160,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20, + $ -124/ C 1 T(1,7,5,2,6,3,4) - DATA (CF(I, 22),I= 1, 6) /1.530864197530864D+00, - $ 
-2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 22),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 13, 18) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 19, 24) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 22),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 37, 42) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 22),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 55, 60) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 22),I= 61, 66) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 22),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 22),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 22),I= 79, 84) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 22),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 22),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 22),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + 
DATA (CF(I),I=2311,2409) /4096,128,-1024,-124,20,56,-124,2,20, + $ -106,-124,884,-232,56,-88,-88,1028,-232,272,-88,-232,20,200, + $ -124,1028,-106,-88,20,2,2,-16,-16,128,38,-142,-268,-88,-124, + $ -106,-142,1136,1010,1028,992,-124,-160,1280,20,200,-160,20,-124 + $ ,-142,20,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268, + $ -88,-448,56,20,-160,38,-106,992,-124,-16,-160,128,-1024,-16,128 + $ ,2,20,20,-124,-124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160 + $ ,20/ C 1 T(1,7,5,6,2,3,4) - DATA (CF(I, 23),I= 1, 6) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 7, 12) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 23),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 31, 36) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 23),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 23),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 23),I= 55, 60) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 23),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 23),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 23),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 23),I= 79, 84) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 23),I= 85, 90) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 23),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 23),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ 
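The start and end indices of the added DATA blocks are consistent with that triangular layout; a quick check (hypothetical helper, same NCOLOR = 120 assumption) against three of the ranges visible in this hunk:

def start(j, ncolor=120):
    # First packed index of row j: one past the total length of rows 1..j-1.
    return 1 + sum(ncolor - k + 1 for k in range(1, j))

assert start(8) == 820     # DATA (CF(I),I= 820, 932): row  8, 113 entries
assert start(23) == 2410   # DATA (CF(I),I=2410,2507): row 23,  98 entries
assert start(25) == 2605   # DATA (CF(I),I=2605,2700): row 25,  96 entries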
- DATA (CF(I, 23),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 23),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=2410,2507) /4096,-1024,-124,-142,20,-160,2,-16, + $ -268,1010,884,-232,-268,884,-124,992,-268,-88,-448,56,20,-160 + $ ,38,-106,992,-124,56,-124,-124,20,20,2,-88,1028,-232,272,-88, + $ -232,-106,-124,884,-232,56,-88,200,20,-106,-88,-124,1028,2,-16 + $ ,20,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88, + $ -124,-106,1280,-160,-160,20,20,200,-16,128,-160,-16,-1024,128,2 + $ ,-16,-142,-124,-160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20/ C 1 T(1,7,6,2,5,3,4) - DATA (CF(I, 24),I= 1, 6) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 7, 12) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I= 19, 24) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 24),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 24),I= 31, 36) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 24),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 24),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 24),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 55, 60) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 24),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 24),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 79, 84) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ 
-1.635802469135803D-01/ - DATA (CF(I, 24),I= 85, 90) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 24),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 24),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 24),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=2508,2604) /4096,56,-124,-124,20,20,2,-88,1028, + $ -232,272,-88,-232,-106,-124,884,-232,56,-88,200,20,-106,-88, + $ -124,1028,-124,-142,20,-160,2,-16,-268,1010,884,-232,-268,884, + $ -124,992,-268,-88,-448,56,20,-160,38,-106,992,-124,20,2,2,-16, + $ -16,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124 + $ ,-160,1280,20,200,-160,20,-160,-16,-16,128,128,-1024,20,2,-124 + $ ,56,20,-124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2/ C 1 T(1,7,6,5,2,3,4) - DATA (CF(I, 25),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I= 25, 30) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 25),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 25),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 25),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 25),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ 
+00/ - DATA (CF(I, 25),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 25),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 85, 90) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 91, 96) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 25),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I=109,114) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I=115,120) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=2605,2700) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992, + $ -124,1136,-142,1028,1010,-124,-106,-142,38,-88,-268,-160,20,992 + $ ,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992,-124,-88,-268 + $ ,-268,884,1010,-268,-232,884,20,200,-124,1028,-106,-88,2,20,20, + $ -124,-124,56,56,-88,-124,-106,-232,884,-88,-232,1028,-88,272, + $ -232/ C 1 T(2,1,5,6,7,3,4) - DATA (CF(I, 26),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I= 25, 30) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 26),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 26),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 26),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 26),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 26),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 26),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 85, 90) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 26),I= 91, 96) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 26),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 26),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I=109,114) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I=115,120) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2701,2795) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010,20,200,-124,1028,-106 + $ ,-88,2,20,20,-124,-124,56,56,-88,-124,-106,-232,884,-88,-232 + $ ,1028,-88,272,-232,-160,20,992,-124,38,-106,-16,2,-160,20,-142, + $ -124,-448,56,992,-124,-88,-268,-268,884,1010,-268,-232,884/ C 1 T(2,1,5,7,6,3,4) - DATA (CF(I, 27),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 27),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 27),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 27),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 27),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 27),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 27),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 61, 66) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 67, 72) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 27),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 27),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 27),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 27),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I=109,114) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 27),I=115,120) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2796,2889) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,992,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992, + $ -124,-88,-268,-268,884,1010,-268,-232,884,1280,-160,-160,20,20 + $ ,200,128,-16,-16,2,2,20,992,-124,1136,-142,1028,1010,-124,-106, + $ -142,38,-88,-268,200,20,-106,-88,-124,1028,20,2,-124,56,20,-124 + $ ,-88,-232,1028,-88,272,-232,56,-88,-124,-106,-232,884/ C 1 T(2,1,6,5,7,3,4) - DATA (CF(I, 28),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 28),I= 7, 12) /3.086419753086420D-03 - $ 
,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I= 25, 30) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 28),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 28),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 28),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 28),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 28),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 61, 66) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I= 67, 72) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 28),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 28),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 28),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 28),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 28),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I=109,114) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I=115,120) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2890,2982) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ 
,200,-124,1028,-106,-88,2,20,20,-124,-124,56,56,-88,-124,-106, + $ -232,884,-88,-232,1028,-88,272,-232,-160,1280,20,200,-160,20, + $ -16,128,2,20,-16,2,-124,-106,-142,38,-88,-268,992,-124,1136, + $ -142,1028,1010,20,-160,38,-106,992,-124,2,-16,-142,-124,-160,20 + $ ,-268,884,1010,-268,-232,884,-448,56,992,-124,-88,-268/ C 1 T(2,1,6,7,5,3,4) - DATA (CF(I, 29),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 29),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 29),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 29),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 29),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 29),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 61, 66) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 67, 72) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 29),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 29),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 85, 90) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 29),I= 91, 96) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 29),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 29),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2983,3074) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,38,-106,992,-124,2,-16,-142,-124,-160,20,-268,884,1010,-268, + $ -232,884,-448,56,992,-124,-88,-268,200,20,-106,-88,-124,1028,20 + $ ,2,-124,56,20,-124,-88,-232,1028,-88,272,-232,56,-88,-124,-106, + $ -232,884,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992,-124 + $ ,1136,-142,1028,1010,-124,-106,-142,38,-88,-268/ C 1 T(2,1,7,5,6,3,4) - DATA (CF(I, 30),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 30),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I= 25, 30) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 30),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 30),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 30),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 30),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 30),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 61, 66) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 30),I= 67, 72) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 85, 90) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ 
-4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 91, 96) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 30),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ + DATA (CF(I),I=3075,3165) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,-106, + $ -88,-124,1028,20,2,-124,56,20,-124,-88,-232,1028,-88,272,-232 + $ ,56,-88,-124,-106,-232,884,20,-160,38,-106,992,-124,2,-16,-142, + $ -124,-160,20,-268,884,1010,-268,-232,884,-448,56,992,-124,-88, + $ -268,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010/ C 1 T(2,1,7,6,5,3,4) - DATA (CF(I, 31),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 31),I= 13, 18) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 31),I= 19, 24) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 31, 36) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 31),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 31),I= 43, 48) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 31),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ 
-02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 31),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 31),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 31),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I=109,114) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 31),I=115,120) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=3166,3255) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2 + $ ,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124,-142,20, + $ -124,2,20,56,-124,992,-124,-448,56,-268,-88,-160,20,-16,2,-124, + $ -142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,-124, + $ -106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028,20,200,-88, + $ -106,-232,272,-88,1028,-232,-88/ C 1 T(2,5,1,6,7,3,4) - DATA (CF(I, 32),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 32),I= 13, 18) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 19, 24) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 32),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 31, 36) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 32),I= 37, 42) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 32),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 32),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - 
DATA (CF(I, 32),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 32),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 85, 90) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 32),I= 91, 96) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 32),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=3256,3344) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2,128 + $ ,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160,20,-16,2,-124 + $ ,-142,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028 + $ ,20,200,-88,-106,-232,272,-88,1028,-232,-88,992,-124,-448,56, + $ -268,-88,-160,20,-16,2,-124,-142,992,-124,-160,20,-106,38,884, + $ -232,-268,1010,884,-268/ C 1 T(2,5,1,7,6,3,4) - DATA (CF(I, 33),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 33),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 33),I= 19, 24) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 33),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 33),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 33),I= 37, 42) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 43, 48) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 33),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I= 73, 78) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 33),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 33),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 97,102) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 33),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I=109,114) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 33),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=3345,3432) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124,128 + $ ,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56,20,2,-124,20 + $ ,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160,20,1280, + $ -160,200,20,-268,-88,38,-142,-106,-124,1028,-88,-88,-232,-232 + $ ,272,-124,56,20,2,-124,20,-106,-88,200,20,1028,-124,884,-232, + $ -106,-124,-88,56/ C 1 T(2,5,6,1,7,3,4) - DATA (CF(I, 34),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I= 19, 24) /1.586419753086420D+00, - $ 
-1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 34),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 34),I= 31, 36) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 34),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 43, 48) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 34),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 73, 78) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 91, 96) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I= 97,102) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I=109,114) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ + DATA (CF(I),I=3433,3519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,-16,-160 + $ ,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16,20,-160, + $ -142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160,1280 + $ ,20,-160,1010,1028,-142,1136,-124,992,1010,-268,-268,884,884, + $ -232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124,992,-268,-88, + $ -124,992,56,-448/ C 1 T(2,5,6,7,1,3,4) - DATA (CF(I, 35),I= 1, 6) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 35),I= 13, 18) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 35),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 35),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 35),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 35),I= 37, 42) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 35),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 35),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 35),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I= 73, 78) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 35),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 85, 90) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 35),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 35),I= 97,102) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 35),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 35),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA 
(CF(I),I=3520,3605) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,-16,128, + $ -160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16,20,2,1028 + $ ,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200,20 + $ ,1028,-124,884,-232,-106,-124,-88,56,1136,-142,992,-124,1010 + $ ,1028,-16,2,128,-16,20,2,-160,20,1280,-160,200,20,-268,-88,38, + $ -142,-106,-124/ C 1 T(2,5,7,1,6,3,4) - DATA (CF(I, 36),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 36),I= 13, 18) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 36),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 31, 36) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 36),I= 37, 42) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 36),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 36),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 36),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I= 73, 78) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 85, 90) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 36),I= 97,102) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 36),I=103,108) 
/3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I=115,120) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3606,3690) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,-160,-16,-16,128 + $ ,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,-16,1010,-268 + $ ,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124 + $ ,992,-268,-88,-124,992,56,-448,-142,38,-124,-106,-268,-88,2,20, + $ -16,128,2,-16,20,200,-160,1280,20,-160,1010,1028,-142,1136,-124 + $ ,992/ C 1 T(2,5,7,6,1,3,4) - DATA (CF(I, 37),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 37),I= 7, 12) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 37),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 37),I= 19, 24) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 37),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 37),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 37),I= 37, 42) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 37),I= 43, 48) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 37),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 37),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 85, 90) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 37),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I=109,114) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=115,120) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ + DATA (CF(I),I=3691,3774) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,992,-124,-448,56,-268,-88,-160,20,-16,2,-124 + $ ,-142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,128, + $ -16,-16,2,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124, + $ -142,20,-124,2,20,56,-124,-106,-124,884,-232,56,-88,-124,20,56, + $ -124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20,200,-88,-106/ C 1 T(2,6,1,5,7,3,4) - DATA (CF(I, 38),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 7, 12) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 38),I= 19, 24) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 38),I= 31, 36) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 38),I= 37, 42) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 38),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 38),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 38),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 61, 66) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 38),I= 67, 72) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ 
,1.586419753086420D+00,-3.580246913580247D-01,
-     $ -1.358024691358025D-01/
-      [DATA (CF(I, 38),I=73,120) double-precision coefficients elided]
+      DATA (CF(I),I=3775,3857) /4096,128,1280,-1024,128,-160,1280,992
+     $ ,1136,128,-16,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,
+     $ -124,1028,20,200,-88,-106,-232,272,-88,1028,-232,-88,-16,128,2
+     $ ,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160
+     $ ,20,-16,2,-124,-142,-124,992,-268,-88,-448,56,20,-160,-124,-142
+     $ ,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,-106,38/
C 1 T(2,6,1,7,5,3,4)
-      [DATA (CF(I, 39),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=3858,3939) /4096,-1024,1280,128,-448,992,992,-160,
+     $ -160,-16,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160
+     $ ,20,1280,-160,200,20,-268,-88,38,-142,-106,-124,-16,2,-160,20,
+     $ -142,-124,128,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56
+     $ ,20,2,-124,20,-88,1028,-232,272,-88,-232,56,-124,-124,20,20,2
+     $ ,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,-124/
C 1 T(2,6,5,1,7,3,4)
-      [DATA (CF(I, 40),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=3940,4020) /4096,128,-1024,992,1136,-160,1280,-16
+     $ ,128,-142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160
+     $ ,1280,20,-160,1010,1028,-142,1136,-124,992,2,20,20,-124,-124,56
+     $ ,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16
+     $ ,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160,2,-16,
+     $ -268,-88,-124,992,56,-448,38,-106,20,-160,-124,992/
C 1 T(2,6,5,7,1,3,4)
-      [DATA (CF(I, 41),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4021,4100) /4096,-1024,-16,128,-160,-16,-1024,128
+     $ ,1028,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200
+     $ ,20,1028,-124,884,-232,-106,-124,-88,56,2,-16,-142,-124,-160,20
+     $ ,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16
+     $ ,20,2,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268,-88
+     $ ,38,-142,-106,-124,-160,20,1280,-160,200,20/
C 1 T(2,6,7,1,5,3,4)
-      [DATA (CF(I, 42),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4101,4179) /4096,-160,-16,-16,128,128,-1024,1010,
+     $ -268,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,
+     $ -124,992,-268,-88,-124,992,56,-448,20,2,-124,56,20,-124,-160,
+     $ -16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,
+     $ -16,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010,1028,
+     $ -142,1136,-124,992,20,200,-160,1280,20,-160/
C 1 T(2,6,7,5,1,3,4)
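The two representations above appear to encode the same rational colour matrix: every removed double is a multiple of 1/324 (for example 1.975308641975309D-01 = 64/324 = 16/81), and each new integer matches the old entry scaled by 324 on the diagonal and by 648 = 2x324 off it, i.e. the i<->j symmetry factor seems to be folded into the stored value. A minimal sketch of that consistency check, assuming this reading of the hunk (packed_entry is an illustrative helper, not part of the generated code):

from fractions import Fraction

def packed_entry(cf_ij, diagonal):
    # Assumed mapping: integer = CF(i,j) * 324 on the diagonal,
    # CF(i,j) * 648 off it (factor 2 from the symmetry of the matrix).
    scale = 324 if diagonal else 648
    q = Fraction(cf_ij).limit_denominator(1000) * scale
    assert q.denominator == 1  # every entry should come out exactly integer
    return int(q)

# Doubles that appear in the removed blocks map onto integers in the added ones:
assert packed_entry(1.264197530864197e+01, diagonal=True) == 4096
assert packed_entry(1.975308641975309e+00, diagonal=False) == 1280
assert packed_entry(-1.580246913580247e+00, diagonal=False) == -1024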
-      [DATA (CF(I, 43),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4180,4257) /4096,-1024,-1024,128,128,1280,-124,992
+     $ ,-268,-88,-448,56,20,-160,-124,-142,-16,2,884,-232,-268,1010
+     $ ,884,-268,992,-124,-160,20,-106,38,-106,-124,884,-232,56,-88,
+     $ -124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20
+     $ ,200,-88,-106,128,-16,-16,2,2,20,-1024,128,128,-16,-16,-160,
+     $ -160,20,-16,2,-124,-142,20,-124,2,20,56,-124/
C 1 T(2,7,1,5,6,3,4)
-      [DATA (CF(I, 44),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4258,4334) /4096,128,1280,-1024,128,-106,-124,884,
+     $ -232,56,-88,-124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,
+     $ -124,1028,20,200,-88,-106,-124,992,-268,-88,-448,56,20,-160,
+     $ -124,-142,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,
+     $ -106,38,-16,128,2,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2
+     $ ,20,56,-124,-160,20,-16,2,-124,-142/
C 1 T(2,7,1,6,5,3,4)
-      [DATA (CF(I, 45),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4335,4410) /4096,-1024,1280,128,-142,1136,1010
+     $ ,1028,992,-124,2,-16,20,2,128,-16,-268,-88,38,-142,-106,-124,
+     $ -160,20,1280,-160,200,20,-88,1028,-232,272,-88,-232,56,-124,
+     $ -124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,
+     $ -124,-16,2,-160,20,-142,-124,128,-16,-1024,128,-160,-16,-16,2
+     $ ,128,-16,20,2,-124,56,20,2,-124,20/
C 1 T(2,7,5,1,6,3,4)
-      [DATA (CF(I, 46),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4411,4485) /4096,128,-1024,38,-142,-268,-88,-124,
+     $ -106,20,2,2,-16,-16,128,1010,1028,-142,1136,-124,992,20,200,
+     $ -160,1280,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160
+     $ ,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,-124,992,2,20
+     $ ,20,-124,-124,56,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,
+     $ -142,-124,2,-16,20,-160/
C 1 T(2,7,5,6,1,3,4)
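The added ranges also shrink by one entry per colour flow (83 values at I=3775,3857; 82 at I=3858,3939; 81 at I=3940,4020; and so on), which is exactly the row-length pattern of the upper triangle, diagonal included, of a symmetric 120x120 matrix flattened into one 1-based array. A short sketch of the implied index arithmetic, assuming that layout (row_start and flat_index are illustrative names):

N = 120  # colour flows in this matrix

def row_start(k):
    # 1-based offset of packed row k: rows 1..k-1 contribute
    # N-j+1 entries each (row j keeps only columns j..N).
    return 1 + sum(N - j + 1 for j in range(1, k))

def flat_index(i, j):
    # Packed position of CF(i,j); only i <= j is stored.
    i, j = min(i, j), max(i, j)
    return row_start(i) + (j - i)

assert row_start(38) == 3775 and row_start(39) == 3858
assert flat_index(38, 38) == 3775  # diagonal entry, the 4096 above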
-      [DATA (CF(I, 47),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4486,4559) /4096,-1024,-88,1028,-232,272,-88,-232
+     $ ,56,-124,-124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20
+     $ ,1028,-124,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268
+     $ ,-88,38,-142,-106,-124,-160,20,1280,-160,200,20,2,-16,-142,-124
+     $ ,-160,20,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2
+     $ ,128,-16,20,2/
C 1 T(2,7,6,1,5,3,4)
-      [DATA (CF(I, 48),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4560,4632) /4096,-268,1010,884,-232,-268,884,-124,
+     $ -142,20,-160,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,
+     $ -124,992,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010
+     $ ,1028,-142,1136,-124,992,20,200,-160,1280,20,-160,20,2,-124,56
+     $ ,20,-124,-160,-16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2
+     $ ,20,-16,128,2,-16/
C 1 T(2,7,6,5,1,3,4)
-      [DATA (CF(I, 49),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4633,4704) /4096,-1024,-1024,128,128,1280,-1024
+     $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160
+     $ ,992,992,-448,992,-124,-160,20,-106,38,-448,56,992,-124,-88,
+     $ -268,-16,2,-160,20,-142,-124,884,-268,-232,884,1010,-268,-124
+     $ ,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20,20,-124,
+     $ -124,56,-232,-88,272,-232,1028,-88/
C 1 T(5,1,2,6,7,3,4)
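With the factor 2 folded into the off-diagonal entries, the colour sum can be evaluated in a single pass over the packed triangle rather than over the full square matrix. A hedged sketch of that contraction (the 1/324 normalisation is inferred from the integer entries above, and color_sum/jamp are illustrative names; the generated kernels may organise this differently):

def color_sum(cf_packed, jamp):
    # ME = sum over i <= j of CF_packed * Re(jamp[i] * conj(jamp[j])) / 324,
    # where CF_packed already carries the factor 2 for i < j.
    me, pos = 0.0, 0
    n = len(jamp)
    for i in range(n):
        for j in range(i, n):
            me += cf_packed[pos] * (jamp[i] * jamp[j].conjugate()).real
            pos += 1
    return me / 324.0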
-      [DATA (CF(I, 50),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4705,4775) /4096,128,1280,-1024,128,128,-1024,-16,
+     $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136
+     $ ,992,-124,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20
+     $ ,20,-124,-124,56,-232,-88,272,-232,1028,-88,992,-124,-160,20,
+     $ -106,38,-448,56,992,-124,-88,-268,-16,2,-160,20,-142,-124,884,
+     $ -268,-232,884,1010,-268/
C 1 T(5,1,2,7,6,3,4)
-      [DATA (CF(I, 51),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4776,4845) /4096,-1024,1280,128,128,-16,1280,-160
+     $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992
+     $ ,-160,20,1280,-160,200,20,992,-124,1136,-142,1028,1010,128,-16,
+     $ -16,2,2,20,-106,-124,-88,-268,-142,38,-106,-88,200,20,1028,-124
+     $ ,-88,-232,1028,-88,272,-232,20,2,-124,56,20,-124,-88,56,-232
+     $ ,884,-124,-106/
C 1 T(5,1,6,2,7,3,4)
-      [DATA (CF(I, 52),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4846,4914) /4096,128,-1024,-16,-160,-160,992,992,
+     $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20
+     $ ,200,-160,1280,20,-160,-124,-106,-142,38,-88,-268,-16,128,2,20,
+     $ -16,2,-124,992,1028,1010,1136,-142,38,-106,20,-160,-124,992,
+     $ -268,884,1010,-268,-232,884,2,-16,-142,-124,-160,20,56,-448,-88
+     $ ,-268,992,-124/
C 1 T(5,1,6,7,2,3,4)
-      [DATA (CF(I, 53),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4915,4982) /4096,-1024,-16,128,1136,992,1280,-160,
+     $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-106,-88
+     $ ,200,20,1028,-124,-88,-232,1028,-88,272,-232,20,2,-124,56,20,
+     $ -124,-88,56,-232,884,-124,-106,-160,20,1280,-160,200,20,992,
+     $ -124,1136,-142,1028,1010,128,-16,-16,2,2,20,-106,-124,-88,-268,
+     $ -142,38/
C 1 T(5,1,7,2,6,3,4)
-      [DATA (CF(I, 54),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4983,5049) /4096,-160,-16,992,-448,-160,992,-16
+     $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,38,-106,20,
+     $ -160,-124,992,-268,884,1010,-268,-232,884,2,-16,-142,-124,-160
+     $ ,20,56,-448,-88,-268,992,-124,20,200,-160,1280,20,-160,-124,
+     $ -106,-142,38,-88,-268,-16,128,2,20,-16,2,-124,992,1028,1010
+     $ ,1136,-142/
C 1 T(5,1,7,6,2,3,4)
-      [DATA (CF(I, 55),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=5050,5115) /4096,-1024,-1024,128,128,1280,1280,
+     $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-448,56,992,
+     $ -124,-88,-268,992,-124,-160,20,-106,38,-160,20,-16,2,-124,-142,
+     $ -232,884,884,-268,-268,1010,56,-88,-124,-106,-232,884,-124,1028
+     $ ,20,200,-88,-106,20,-124,2,20,56,-124,272,-232,-232,-88,-88
+     $ ,1028/
C 1 T(5,2,1,6,7,3,4)
-      [DATA (CF(I, 56),I=1,30) double-precision coefficients elided]
-      DATA (CF(I, 56),I= 31, 36) /1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02,
-     $
-2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 55, 60) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 56),I= 61, 66) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 56),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 56),I= 73, 78) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 56),I= 79, 84) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 56),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 91, 96) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 56),I= 97,102) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 56),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ + DATA (CF(I),I=5116,5180) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-88,-124,-106,-232 + $ ,884,-124,1028,20,200,-88,-106,20,-124,2,20,56,-124,272,-232, + $ -232,-88,-88,1028,-448,56,992,-124,-88,-268,992,-124,-160,20, + $ -106,38,-160,20,-16,2,-124,-142,-232,884,884,-268,-268,1010/ C 1 T(5,2,1,7,6,3,4) - DATA (CF(I, 57),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 57),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 13, 18) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 57),I= 19, 24) /-4.135802469135803D-01 - $ 
,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 57),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 57),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 57),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 67, 72) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 57),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 57),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 57),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 57),I= 97,102) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 57),I=103,108) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 57),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=5181,5244) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,992,-124,1136,-142,1028 + $ ,1010,-160,20,1280,-160,200,20,-16,2,128,-16,20,2,-88,-268,-106 + $ ,-124,38,-142,-88,-232,1028,-88,272,-232,-106,-88,200,20,1028, + $ -124,-124,56,20,2,-124,20,-232,884,-88,56,-106,-124/ C 1 T(5,2,6,1,7,3,4) - DATA (CF(I, 58),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 58),I= 7, 
12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 13, 18) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I= 19, 24) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 58),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 58),I= 55, 60) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 58),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 67, 72) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 58),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 58),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 91, 96) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 58),I= 97,102) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I=103,108) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 58),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=5245,5307) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,-124,-106,-142,38,-88,-268,20 + $ 
,200,-160,1280,20,-160,2,20,-16,128,2,-16,1028,1010,-124,992, + $ -142,1136,-268,884,1010,-268,-232,884,38,-106,20,-160,-124,992, + $ -142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992/ C 1 T(5,2,6,7,1,3,4) - DATA (CF(I, 59),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 13, 18) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 59),I= 19, 24) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 59),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 59),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 59),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 59),I= 61, 66) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 59),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 59),I= 73, 78) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 59),I= 79, 84) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 59),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 59),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 59),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ + DATA (CF(I),I=5308,5369) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-88,-232,1028,-88,272,-232,-106,-88 + $ ,200,20,1028,-124,-124,56,20,2,-124,20,-232,884,-88,56,-106, + $ -124,992,-124,1136,-142,1028,1010,-160,20,1280,-160,200,20,-16 + $ ,2,128,-16,20,2,-88,-268,-106,-124,38,-142/ C 1 T(5,2,7,1,6,3,4) - DATA (CF(I, 60),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 60),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 13, 18) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 60),I= 19, 24) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 60),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 55, 60) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 60),I= 61, 66) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 60),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 60),I= 73, 78) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 79, 84) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 97,102) /-1.913580246913580D-01, - $ 
-1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 60),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I=115,120) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ + DATA (CF(I),I=5370,5430) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-268,884,1010,-268,-232,884,38,-106,20, + $ -160,-124,992,-142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992 + $ ,-124,-106,-142,38,-88,-268,20,200,-160,1280,20,-160,2,20,-16 + $ ,128,2,-16,1028,1010,-124,992,-142,1136/ C 1 T(5,2,7,6,1,3,4) - DATA (CF(I, 61),I= 1, 6) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 61),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 61),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 61),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 61),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 61),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 61),I= 61, 66) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 61),I= 67, 72) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 85, 
90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 61),I=103,108) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I=115,120) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=5431,5490) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,-160,20,-16,2,-124,-142, + $ -1024,128,128,-16,-16,-160,-124,20,56,-124,2,20,884,-232,-106, + $ -124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2,20 + $ ,1028,-124,-88,-106,20,200/ C 1 T(5,6,1,2,7,3,4) - DATA (CF(I, 62),I= 1, 6) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 62),I= 19, 24) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 62),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 37, 42) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 62),I= 43, 48) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 62),I= 55, 60) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 61, 66) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 62),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 62),I= 73, 
78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 62),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 62),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=5491,5549) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,20,-124,2,20,56,-124,128,-1024 + $ ,-16,-160,128,-16,20,-160,-124,-142,-16,2,-268,-88,-124,992,56, + $ -448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16,2,-124 + $ ,992,-106,38,-160,20/ C 1 T(5,6,1,7,2,3,4) - DATA (CF(I, 63),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 63),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 63),I= 25, 30) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 63),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 63),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 63),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 63),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA 
(CF(I, 63),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 63),I= 67, 72) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 97,102) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 63),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I=115,120) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=5550,5607) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,-16,2,128,-16,20,2,128,-16, + $ -1024,128,-160,-16,56,-124,-124,20,20,2,-232,272,-88,1028,-232, + $ -88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88,-106 + $ ,1028,-124,200,20/ C 1 T(5,6,2,1,7,3,4) - DATA (CF(I, 64),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 64),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 13, 18) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 64),I= 19, 24) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 25, 30) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 64),I= 43, 48) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA 
(CF(I, 64),I= 49, 54) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 61, 66) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 64),I= 67, 72) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 64),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 64),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I=115,120) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=5608,5664) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16,-160,128,-1024 + $ ,-16,128,-124,-142,20,-160,2,-16,884,-232,-268,1010,884,-268, + $ -268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38,-124 + $ ,992,20,-160/ C 1 T(5,6,2,7,1,3,4) - DATA (CF(I, 65),I= 1, 6) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 65),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 13, 18) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 65),I= 25, 30) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 65),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA 
(CF(I, 65),I= 37, 42) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 65),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 65),I= 49, 54) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 65),I= 55, 60) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 65),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I= 97,102) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 65),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=5665,5720) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16,128,-160,-16, + $ -1024,128,2,-16,20,2,128,-16,1010,1028,-142,1136,-124,992,-268, + $ -88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200,20,1280, + $ -160/ C 1 T(5,6,7,1,2,3,4) - DATA (CF(I, 66),I= 1, 6) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 66),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 13, 18) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 66),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 66),I= 25, 30) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 66),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 37, 42) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 66),I= 49, 54) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 55, 60) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 66),I= 61, 66) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 66),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 66),I=103,108) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=5721,5775) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16,128,128, + $ -1024,20,2,2,-16,-16,128,-268,-88,38,-142,-106,-124,1010,1028, + $ -142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160,1280/ C 1 T(5,6,7,2,1,3,4) - DATA (CF(I, 67),I= 1, 6) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 67),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 13, 18) /-4.135802469135803D-01, - 
$ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 67),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 67),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 67),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 67),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 67),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 61, 66) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I= 67, 72) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 67),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 79, 84) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 67),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 91, 96) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 67),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 67),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 67),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=5776,5829) /4096,-1024,-1024,128,128,1280,884,-232 + $ ,-106,-124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2 + $ ,20,1028,-124,-88,-106,20,200,-16,2,128,-16,20,2,-160,20,-16,2, + $ -124,-142,-1024,128,128,-16,-16,-160,-124,20,56,-124,2,20/ C 1 T(5,7,1,2,6,3,4) - DATA (CF(I, 68),I= 1, 6) /5.864197530864197D-02, - $ 
-2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 68),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 68),I= 13, 18) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 68),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 68),I= 37, 42) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 43, 48) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 68),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 68),I= 55, 60) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 68),I= 67, 72) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 68),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 68),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 68),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 68),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 68),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 68),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA 
(CF(I),I=5830,5882) /4096,128,1280,-1024,128,-268,-88,-124 + $ ,992,56,-448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16 + $ ,2,-124,992,-106,38,-160,20,2,20,-16,128,2,-16,20,-124,2,20,56, + $ -124,128,-1024,-16,-160,128,-16,20,-160,-124,-142,-16,2/ C 1 T(5,7,1,6,2,3,4) - DATA (CF(I, 69),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 69),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 69),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 69),I= 25, 30) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 69),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 69),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 69),I= 61, 66) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 69),I= 73, 78) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 69),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I= 91, 96) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 69),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I=109,114) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=5883,5934) /4096,-1024,1280,128,-232,272,-88,1028, + $ -232,-88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88, + $ -106,1028,-124,200,20,-160,20,-16,2,-124,-142,-16,2,128,-16,20 + $ ,2,128,-16,-1024,128,-160,-16,56,-124,-124,20,20,2/ C 1 T(5,7,2,1,6,3,4) - DATA (CF(I, 70),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 70),I= 13, 18) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 19, 24) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 70),I= 25, 30) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 70),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 37, 42) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 70),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 49, 54) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 61, 66) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 67, 72) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 70),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 70),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I= 91, 96) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 97,102) /3.086419753086420D-02, - 
$ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 70),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=5935,5985) /4096,128,-1024,884,-232,-268,1010,884, + $ -268,-268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38, + $ -124,992,20,-160,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16, + $ -160,128,-1024,-16,128,-124,-142,20,-160,2,-16/ C 1 T(5,7,2,6,1,3,4) - DATA (CF(I, 71),I= 1, 6) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 71),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 71),I= 19, 24) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 25, 30) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 71),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 71),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 71),I= 43, 48) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 71),I= 49, 54) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 55, 60) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 71),I= 73, 78) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 71),I= 85, 90) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 71),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 71),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 71),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 71),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=5986,6035) /4096,-1024,1010,1028,-142,1136,-124 + $ ,992,-268,-88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200 + $ ,20,1280,-160,-142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16 + $ ,128,-160,-16,-1024,128,2,-16,20,2,128,-16/ C 1 T(5,7,6,1,2,3,4) - DATA (CF(I, 72),I= 1, 6) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 72),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 72),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 72),I= 19, 24) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 72),I= 25, 30) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 72),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 72),I= 43, 48) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 49, 54) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 55, 60) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I= 67, 72) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 72),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ 
-2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 72),I= 79, 84) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 72),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 72),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 72),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6036,6084) /4096,-268,-88,38,-142,-106,-124,1010 + $ ,1028,-142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160 + $ ,1280,-124,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16 + $ ,128,128,-1024,20,2,2,-16,-16,128/ C 1 T(5,7,6,2,1,3,4) - DATA (CF(I, 73),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 73),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 73),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 73),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 55, 60) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 73),I= 67, 72) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 73, 78) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 73),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 73),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 73),I= 97,102) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 73),I=103,108) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 73),I=109,114) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 73),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=6085,6132) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1028,-124,-88,-106,20,200,-88,56,-232,884,-124, + $ -106,-232,-88,272,-232,1028,-88,2,20,20,-124,-124,56/ C 1 T(6,1,2,5,7,3,4) - DATA (CF(I, 74),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 74),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 74),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 74),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 74),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 74),I= 49, 54) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ 
-01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 74),I= 55, 60) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 74),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 74),I= 67, 72) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 74),I= 73, 78) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 74),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 74),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 74),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 74),I=103,108) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 74),I=109,114) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6133,6179) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,992,-106,38,-160,20,56,-448,-88,-268,992,-124,884, + $ -268,-232,884,1010,-268,-16,2,-160,20,-142,-124/ C 1 T(6,1,2,7,5,3,4) - DATA (CF(I, 75),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 75),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 75),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 31, 36) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ 
-1.913580246913580D-01/ - DATA (CF(I, 75),I= 43, 48) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 75),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 75),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 75),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 75),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 75),I= 97,102) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I=103,108) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 75),I=109,114) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 75),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6180,6225) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-88,-106,1028,-124,200,20,-232,-88,272,-232,1028,-88,-88,56, + $ -232,884,-124,-106,20,2,-124,56,20,-124/ C 1 T(6,1,5,2,7,3,4) - DATA (CF(I, 76),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 76),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 76),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 25, 30) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 
76),I= 31, 36) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 76),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 76),I= 43, 48) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 76),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 76),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 76),I= 73, 78) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 76),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 76),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 76),I= 97,102) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I=103,108) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I=109,114) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 76),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=6226,6270) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-106 + $ ,38,-124,992,20,-160,884,-268,-232,884,1010,-268,56,-448,-88, + $ -268,992,-124,2,-16,-142,-124,-160,20/ C 1 T(6,1,5,7,2,3,4) - DATA (CF(I, 77),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 77),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 77),I= 19, 24) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 77),I= 25, 30) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 31, 36) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 77),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 77),I= 43, 48) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 49, 54) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 77),I= 55, 60) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 77),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 67, 72) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 77),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 77),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 77),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 77),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6271,6314) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,200,20,1280,-160,-124,992,1028,1010,1136,-142,-106,-124,-88, + $ -268,-142,38,128,-16,-16,2,2,20/ C 1 T(6,1,7,2,5,3,4) - DATA (CF(I, 78),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 78),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 78),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 78),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 78),I= 25, 30) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 78),I= 31, 36) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 78),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 43, 48) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 78),I= 49, 54) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 55, 60) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 78),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 78),I= 67, 72) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 73, 78) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 78),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 78),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 78),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 78),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 78),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 78),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=6315,6357) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,20, + $ -160,-160,1280,-106,-124,-88,-268,-142,38,-124,992,1028,1010 + $ ,1136,-142,-16,128,2,20,-16,2/ C 1 
T(6,1,7,5,2,3,4) - DATA (CF(I, 79),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 79),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 79),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 49, 54) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 79),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 79),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 79, 84) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 79),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 79),I= 91, 96) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 79),I= 97,102) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 79),I=103,108) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 79),I=109,114) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 79),I=115,120) /3.086419753086420D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=6358,6399) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-88,56,-232 + $ ,884,-124,-106,1028,-124,-88,-106,20,200,272,-232,-232,-88,-88 + $ ,1028,20,-124,2,20,56,-124/ C 1 T(6,2,1,5,7,3,4) - DATA (CF(I, 80),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 80),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 80),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 80),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 80),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 80),I= 49, 54) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 80),I= 55, 60) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 80),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 67, 72) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 80),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 79, 84) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 80),I= 85, 90) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 80),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 80),I= 97,102) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 80),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 80),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 80),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=6400,6440) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-448,-88,-268,992, + $ -124,-124,992,-106,38,-160,20,-232,884,884,-268,-268,1010,-160 + $ ,20,-16,2,-124,-142/ C 1 T(6,2,1,7,5,3,4) - DATA (CF(I, 81),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 81),I= 7, 12) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I= 19, 24) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 81),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 81),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 81),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 91, 96) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ 
-2.469135802469136D-01/ - DATA (CF(I, 81),I= 97,102) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 81),I=103,108) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=6441,6480) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-232,-88,272,-232,1028,-88, + $ -88,-106,1028,-124,200,20,-232,884,-88,56,-106,-124,-124,56,20 + $ ,2,-124,20/ C 1 T(6,2,5,1,7,3,4) - DATA (CF(I, 82),I= 1, 6) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 82),I= 7, 12) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 82),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 82),I= 19, 24) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 82),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 82),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 67, 72) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 82),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 82),I= 79, 84) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 82),I= 85, 90) 
/-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 91, 96) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 82),I= 97,102) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I=103,108) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 82),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6481,6519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,884,-268,-232,884,1010,-268, + $ -106,38,-124,992,20,-160,-88,-268,56,-448,-124,992,-142,-124,2, + $ -16,20,-160/ C 1 T(6,2,5,7,1,3,4) - DATA (CF(I, 83),I= 1, 6) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 83),I= 7, 12) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 83),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 83),I= 19, 24) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 83),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 83),I= 49, 54) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 83),I= 55, 60) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 83),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 73, 78) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 83),I= 85, 90) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 83),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 83),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 83),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6520,6557) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-124,992,1028,1010,1136,-142,20, + $ -160,200,20,1280,-160,-88,-268,-106,-124,38,-142,-16,2,128,-16 + $ ,20,2/ C 1 T(6,2,7,1,5,3,4) - DATA (CF(I, 84),I= 1, 6) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 84),I= 7, 12) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 84),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 84),I= 19, 24) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 84),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 84),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 84),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 49, 54) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 84),I= 55, 60) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - 
$ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 79, 84) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 84),I= 85, 90) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 84),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I=109,114) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 84),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6558,6594) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-106,-124,-88,-268,-142,38,200,20,20, + $ -160,-160,1280,1028,1010,-124,992,-142,1136,2,20,-16,128,2,-16/ C 1 T(6,2,7,5,1,3,4) - DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01 - 
-      DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,1.753086419753086D+00,
-     $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D
-     $ +00/
-      DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,1.975308641975309D+00,
-     $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,-1.635802469135803D-01,
-     $ -1.913580246913580D-01,5.864197530864197D-02,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01
-     $ ,8.641975308641975D-02,1.530864197530864D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-1.635802469135803D-01
-     $ ,5.864197530864197D-02/
-      DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 61, 66) /-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,-2.469135802469136D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 85),I= 67, 72) /-1.913580246913580D-01
-     $ ,3.086419753086420D-02,8.641975308641975D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 85),I= 73, 78) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 85),I= 79, 84) /1.975308641975309D+00,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D
-     $ +00/
-      DATA (CF(I, 85),I= 85, 90) /1.264197530864197D+01,
-     $ -1.580246913580247D+00,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D
-     $ +00/
-      DATA (CF(I, 85),I= 91, 96) /1.530864197530864D+00,
-     $ -2.469135802469136D-01,-6.913580246913580D-01
-     $ ,1.530864197530864D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 85),I= 97,102) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,-1.358024691358025D-01
-     $ ,8.641975308641975D-02,-1.635802469135803D-01,
-     $ -1.913580246913580D-01/
-      DATA (CF(I, 85),I=103,108) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 85),I=109,114) /1.586419753086420D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D
-     $ -01/
-      DATA (CF(I, 85),I=115,120) /-1.913580246913580D-01
-     $ ,3.086419753086420D-02,8.641975308641975D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02/
+      DATA (CF(I),I=6595,6630) /4096,-1024,-1024,128,128,1280,992,-160
+     $ ,-448,992,-16,-160,-232,884,-88,56,-106,-124,272,-232,-232,-88,
+     $ -88,1028,1028,-124,-88,-106,20,200,-124,20,56,-124,2,20/
 C 1 T(6,5,1,2,7,3,4)
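The index ranges of the new statements follow from this triangular packing: row I of an NCOLOR=120 matrix occupies NCOLOR-I+1 slots, so its first flat index is (I-1)*(2*NCOLOR-I+2)/2 + 1. For I=84 this gives 83*158/2 + 1 = 6558, for I=85 it gives 84*157/2 + 1 = 6595, and for I=86 it gives 85*156/2 + 1 = 6631, matching the ranges I=6558,6594, I=6595,6630 and I=6631,6665 in the surrounding hunks. As a sketch (ROWSTART is a hypothetical helper, not part of the patch):

      INTEGER FUNCTION ROWSTART(I)
C     First flat index of row I of the packed triangle, NCOLOR=120.
      INTEGER I
      ROWSTART = (I-1)*(2*120-I+2)/2 + 1
      END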
-      DATA (CF(I, 86),I= 1, 6) /-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-2.191358024691358D-01
-     $ ,5.864197530864197D-02,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 86),I= 7, 12) /3.086419753086420D-02
-     $ ,3.086419753086420D-01,-2.469135802469136D-01
-     $ ,1.975308641975309D+00,3.086419753086420D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 86),I= 13, 18) /3.086419753086420D-03
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,1.975308641975309D-01,3.086419753086420D-03,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 19, 24) /1.586419753086420D+00
-     $ ,1.558641975308642D+00,-1.913580246913580D-01
-     $ ,1.530864197530864D+00,-2.191358024691358D-01
-     $ ,1.753086419753086D+00/
-      DATA (CF(I, 86),I= 25, 30) /8.641975308641975D-02,
-     $ -1.358024691358025D-01,-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-3.580246913580247D-01
-     $ ,1.364197530864198D+00/
-      DATA (CF(I, 86),I= 31, 36) /-1.913580246913580D-01
-     $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D
-     $ -01,-1.358024691358025D-01,-1.635802469135803D-01/
-      DATA (CF(I, 86),I= 37, 42) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 86),I= 43, 48) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 86),I= 49, 54) /3.086419753086420D-03
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,1.975308641975309D-01,3.086419753086420D-03,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 55, 60) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 86),I= 61, 66) /1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 67, 72) /3.086419753086420D-02,
-     $ -2.469135802469136D-01,-1.913580246913580D-01,
-     $ -2.191358024691358D-01,-2.469135802469136D-02
-     $ ,3.086419753086420D-03/
-      DATA (CF(I, 86),I= 73, 78) /-2.469135802469136D-02,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02
-     $ ,1.975308641975309D-01/
-      DATA (CF(I, 86),I= 79, 84) /-2.469135802469136D-01
-     $ ,1.530864197530864D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01,-6.913580246913580D-01
-     $ ,1.530864197530864D+00/
-      DATA (CF(I, 86),I= 85, 90) /-1.580246913580247D+00
-     $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D
-     $ +00,-1.580246913580247D+00,1.975308641975309D-01/
-      DATA (CF(I, 86),I= 91, 96) /-2.469135802469136D-01
-     $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D
-     $ +00,1.975308641975309D-01,-2.469135802469136D-02/
-      DATA (CF(I, 86),I= 97,102) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,8.641975308641975D-02,
-     $ -6.913580246913580D-01,-1.913580246913580D-01
-     $ ,1.530864197530864D+00/
-      DATA (CF(I, 86),I=103,108) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 86),I=109,114) /-1.913580246913580D-01
-     $ ,1.530864197530864D+00,-1.635802469135803D-01
-     $ ,5.864197530864197D-02,-2.469135802469136D-01
-     $ ,3.086419753086420D-02/
-      DATA (CF(I, 86),I=115,120) /3.086419753086420D-02,
-     $ -2.469135802469136D-01,-1.913580246913580D-01,
-     $ -2.191358024691358D-01,-2.469135802469136D-02
-     $ ,3.086419753086420D-03/
+      DATA (CF(I),I=6631,6665) /4096,128,1280,-1024,128,-160,1280,992
+     $ ,1136,128,-16,-88,-268,56,-448,-124,992,-232,884,884,-268,-268
+     $ ,1010,-124,992,-106,38,-160,20,20,-160,-124,-142,-16,2/
 C 1 T(6,5,1,7,2,3,4)
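Nothing is lost by the integer conversion: every legacy coefficient is an exact multiple of 1/324 (for instance 3.086419753086420D-03 is exactly 1/324, and 8.641975308641975D-02 is 28/324), so rescaling by 324, or by 648 off the diagonal, yields whole numbers at the printed precision. A consistency check along these lines, assuming the legacy column-major table were still available as a hypothetical array CFOLD:

C     Sketch only: verify each legacy value is an exact multiple of
C     1/324 before it is replaced by a packed integer entry.
      INTEGER I, J
      DO J = 1, NCOLOR
        DO I = 1, NCOLOR
          IF (ABS(324D0*CFOLD(I,J)-NINT(324D0*CFOLD(I,J))).GT.1D-6)
     $      STOP 'CFOLD entry is not a multiple of 1/324'
        ENDDO
      ENDDO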
-      DATA (CF(I, 87),I= 1, 6) /-6.913580246913580D-01
-     $ ,8.641975308641975D-02,1.530864197530864D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 87),I= 7, 12) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-1.635802469135803D-01
-     $ ,5.864197530864197D-02/
-      DATA (CF(I, 87),I= 13, 18) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 19, 24) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 87),I= 25, 30) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,1.753086419753086D+00,
-     $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D
-     $ +00/
-      DATA (CF(I, 87),I= 31, 36) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,1.975308641975309D+00,
-     $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 87),I= 37, 42) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 43, 48) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,-1.635802469135803D-01,
-     $ -1.913580246913580D-01,5.864197530864197D-02,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 49, 54) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 55, 60) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 61, 66) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 67, 72) /8.641975308641975D-02,
-     $ -1.913580246913580D-01,-1.913580246913580D-01
-     $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 73, 78) /1.975308641975309D+00,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D
-     $ +00/
-      DATA (CF(I, 87),I= 79, 84) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 85, 90) /-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.264197530864197D+01,
-     $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D
-     $ -01/
-      DATA (CF(I, 87),I= 91, 96) /-6.913580246913580D-01
-     $ ,1.530864197530864D+00,1.530864197530864D+00,
-     $ -2.469135802469136D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 97,102) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 87),I=103,108) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,-1.358024691358025D-01
-     $ ,8.641975308641975D-02,-1.635802469135803D-01,
-     $ -1.913580246913580D-01/
-      DATA (CF(I, 87),I=109,114) /-1.358024691358025D-01,
-     $ -1.635802469135803D-01,1.586419753086420D+00,
-     $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 87),I=115,120) /8.641975308641975D-02,
-     $ -1.913580246913580D-01,-1.913580246913580D-01
-     $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
+      DATA (CF(I),I=6666,6699) /4096,-1024,1280,128,-448,992,992,-160,
+     $ -160,-16,272,-232,-232,-88,-88,1028,-232,884,-88,56,-106,-124,
+     $ -88,-106,1028,-124,200,20,56,-124,-124,20,20,2/
 C 1 T(6,5,2,1,7,3,4)
-      DATA (CF(I, 88),I= 1, 6) /8.641975308641975D-02,
-     $ -1.358024691358025D-01,-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-3.580246913580247D-01
-     $ ,1.364197530864198D+00/
-      DATA (CF(I, 88),I= 7, 12) /-1.913580246913580D-01
-     $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D
-     $ -01,-1.358024691358025D-01,-1.635802469135803D-01/
-      DATA (CF(I, 88),I= 13, 18) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 88),I= 19, 24) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 88),I= 25, 30) /-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-2.191358024691358D-01
-     $
,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 88),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 43, 48) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 88),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 73, 78) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 85, 90) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 88),I= 91, 96) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 88),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I=109,114) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6700,6732) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-232,884,884,-268,-268,1010,-88,-268,56,-448,-124,992,-106 + $ ,38,-124,992,20,-160,-124,-142,20,-160,2,-16/ C 1 T(6,5,2,7,1,3,4) - DATA (CF(I, 89),I= 1, 6) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 89),I= 7, 12) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 19, 
24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 25, 30) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 89),I= 31, 36) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 89),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 89),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 89),I= 73, 78) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 89),I= 79, 84) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 89),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 97,102) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 89),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 89),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 89),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=6733,6764) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,1010,-124,992,-142,1136,-88,-268,-106,-124,38,-142,20, + $ -160,200,20,1280,-160,2,-16,20,2,128,-16/ C 1 T(6,5,7,1,2,3,4) - DATA (CF(I, 90),I= 1, 6) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 90),I= 7, 12) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ 
-02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 90),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 90),I= 25, 30) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 90),I= 31, 36) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 90),I= 73, 78) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 79, 84) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I= 85, 90) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 90),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 90),I=103,108) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 90),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6765,6795) /4096,-160,-16,-16,128,128,-1024,-88, + $ -268,-106,-124,38,-142,1028,1010,-124,992,-142,1136,200,20,20, + $ -160,-160,1280,20,2,2,-16,-16,128/ C 1 T(6,5,7,2,1,3,4) - DATA (CF(I, 91),I= 1, 6) /1.558641975308642D+00 - $ 
,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 91),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 91),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 91),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 91),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 91),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 91),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 91),I= 55, 60) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 91),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I= 67, 72) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 91),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 91),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 85, 90) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 91),I= 91, 96) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 91),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA 
(CF(I),I=6796,6825) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,20,-160,-124,-142,-16,2,-124,20,56,-124,2,20,-1024 + $ ,128,128,-16,-16,-160/ C 1 T(6,7,1,2,5,3,4) - DATA (CF(I, 92),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 92),I= 7, 12) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 92),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 92),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 92),I= 31, 36) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 92),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I= 43, 48) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 92),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 92),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 92),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 92),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 92),I= 79, 84) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 92),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 92),I= 91, 96) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 92),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ 
-2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6826,6854) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,-124,20,56,-124,2,20,20,-160,-124,-142,-16,2,128,-1024,-16 + $ ,-160,128,-16/ C 1 T(6,7,1,5,2,3,4) - DATA (CF(I, 93),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 93),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 93),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 93),I= 25, 30) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 93),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 93),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 93),I= 49, 54) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 93),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 93),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I= 67, 72) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 93),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 93),I= 85, 90) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 93),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 93),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I=103,108) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6855,6882) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,2,-16,20,2,128,-16,56,-124,-124,20,20,2,128,-16,-1024,128 + $ ,-160,-16/ C 1 T(6,7,2,1,5,3,4) - DATA (CF(I, 94),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 94),I= 7, 12) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 94),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I= 19, 24) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 94),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 94),I= 31, 36) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 94),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 94),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 94),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 94),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I= 67, 72) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 73, 78) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 85, 90) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 91, 96) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA 
(CF(I, 94),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6883,6909) /4096,128,-1024,-124,20,56,-124,2,20,20 + $ ,2,2,-16,-16,128,-124,-142,20,-160,2,-16,-16,-160,128,-1024,-16 + $ ,128/ C 1 T(6,7,2,5,1,3,4) - DATA (CF(I, 95),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 95),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 95),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 19, 24) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 95),I= 25, 30) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 95),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 95),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I= 43, 48) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 95),I= 49, 54) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 95),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 95),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 95),I= 73, 78) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 79, 84) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ 
-2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 95),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6910,6935) /4096,-1024,-124,-142,20,-160,2,-16,56, + $ -124,-124,20,20,2,2,-16,20,2,128,-16,-16,128,-160,-16,-1024,128/ C 1 T(6,7,5,1,2,3,4) - DATA (CF(I, 96),I= 1, 6) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 96),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 96),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I= 19, 24) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 96),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 96),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 96),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 43, 48) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 96),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 96),I= 55, 60) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 96),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 96),I= 73, 78) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 79, 84) /1.530864197530864D+00 - $ 
,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 96),I= 91, 96) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 96),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=6936,6960) /4096,56,-124,-124,20,20,2,-124,-142,20 + $ ,-160,2,-16,20,2,2,-16,-16,128,-160,-16,-16,128,128,-1024/ C 1 T(6,7,5,2,1,3,4) - DATA (CF(I, 97),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 97),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 97),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 97),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 97),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 97),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 97),I= 55, 60) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 61, 66) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 97),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 73, 78) 
/1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 97),I= 79, 84) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 97),I= 85, 90) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 97),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 97),I= 97,102) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 97),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 97),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ + DATA (CF(I),I=6961,6984) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448/ C 1 T(7,1,2,5,6,3,4) - DATA (CF(I, 98),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 98),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 98),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 98),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 98),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 98),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 98),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 98),I= 49, 54) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 98),I= 55, 60) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 98),I= 61, 66) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 98),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 98),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 98),I= 79, 84) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 85, 90) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 98),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 97,102) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 98),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 98),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ + DATA (CF(I),I=6985,7007) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992/ C 1 T(7,1,2,6,5,3,4) - DATA (CF(I, 99),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 99),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 99),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 99),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 99),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 99),I= 31, 36) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 37, 42) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 99),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 99),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - 
$ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 99),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 99),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 73, 78) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 79, 84) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 99),I= 85, 90) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 99),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 99),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 99),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 99),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7008,7029) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992/ C 1 T(7,1,5,2,6,3,4) - DATA (CF(I,100),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,100),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 25, 30) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,100),I= 31, 36) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,100),I= 37, 42) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,100),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,100),I= 49, 54) /3.086419753086420D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,100),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,100),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,100),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,100),I= 73, 78) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,100),I= 79, 84) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,100),I= 85, 90) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,100),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,100),I= 97,102) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,100),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,100),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=7030,7050) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160/ C 1 T(7,1,5,6,2,3,4) - DATA (CF(I,101),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,101),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,101),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,101),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,101),I= 25, 30) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,101),I= 31, 36) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,101),I= 37, 42) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 43, 48) 
/3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,101),I= 49, 54) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 55, 60) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,101),I= 61, 66) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,101),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,101),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,101),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,101),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,101),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7051,7070) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160/ C 1 T(7,1,6,2,5,3,4) - DATA (CF(I,102),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,102),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,102),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,102),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,102),I= 25, 30) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,102),I= 31, 36) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA 
(CF(I,102),I= 37, 42) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,102),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 49, 54) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,102),I= 55, 60) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,102),I= 61, 66) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,102),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,102),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,102),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,102),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,102),I= 97,102) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,102),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,102),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,102),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7071,7089) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16/ C 1 T(7,1,6,5,2,3,4) - DATA (CF(I,103),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,103),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,103),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,103),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA 
(CF(I,103),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,103),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I= 49, 54) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,103),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,103),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,103),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,103),I= 73, 78) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,103),I= 79, 84) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,103),I= 85, 90) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,103),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,103),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I=103,108) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,103),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,103),I=115,120) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7090,7107) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992/ C 1 T(7,2,1,5,6,3,4) - DATA (CF(I,104),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,104),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,104),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,104),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ 
-02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,104),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,104),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,104),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,104),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I= 49, 54) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,104),I= 55, 60) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,104),I= 61, 66) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,104),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,104),I= 73, 78) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,104),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,104),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,104),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,104),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I=103,108) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,104),I=109,114) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,104),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ + DATA (CF(I),I=7108,7124) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136/ C 1 T(7,2,1,6,5,3,4) - DATA (CF(I,105),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,105),I= 7, 12) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 13, 18) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,105),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,105),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,105),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,105),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,105),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,105),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 73, 78) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,105),I= 79, 84) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,105),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,105),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I,105),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,105),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I=115,120) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7125,7140) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160/ C 1 T(7,2,5,1,6,3,4) - DATA (CF(I,106),I= 1, 6) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,106),I= 7, 12) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,106),I= 13, 18) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,106),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,106),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,106),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,106),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,106),I= 61, 66) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,106),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 73, 78) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,106),I= 79, 84) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,106),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,106),I=103,108) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,106),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I=115,120) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=7141,7155) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280/ C 1 T(7,2,5,6,1,3,4) - DATA (CF(I,107),I= 1, 6) /-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I= 7, 12) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,107),I= 13, 18) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,107),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,107),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,107),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,107),I= 49, 54) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,107),I= 55, 60) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,107),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,107),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,107),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,107),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,107),I=109,114) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA 
(CF(I),I=7156,7169) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16/ C 1 T(7,2,6,1,5,3,4) - DATA (CF(I,108),I= 1, 6) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,108),I= 7, 12) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,108),I= 13, 18) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,108),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,108),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,108),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,108),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,108),I= 49, 54) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,108),I= 55, 60) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,108),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I= 85, 90) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,108),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I=103,108) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,108),I=109,114) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - 
DATA (CF(I,108),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7170,7182) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128/ C 1 T(7,2,6,5,1,3,4) - DATA (CF(I,109),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,109),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,109),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,109),I= 25, 30) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,109),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,109),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,109),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,109),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,109),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 79, 84) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,109),I= 85, 90) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,109),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,109),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA 
(CF(I,109),I=109,114) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,109),I=115,120) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7183,7194) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160/ C 1 T(7,5,1,2,6,3,4) - DATA (CF(I,110),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,110),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,110),I= 13, 18) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,110),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 25, 30) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,110),I= 31, 36) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,110),I= 37, 42) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,110),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,110),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,110),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,110),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,110),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA 
(CF(I,110),I=103,108) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I=109,114) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,110),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=7195,7205) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16/ C 1 T(7,5,1,6,2,3,4) - DATA (CF(I,111),I= 1, 6) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,111),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,111),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,111),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,111),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,111),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,111),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I= 73, 78) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,111),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 85, 90) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,111),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 97,102) 
/1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,111),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,111),I=115,120) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7206,7215) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16/ C 1 T(7,5,2,1,6,3,4) - DATA (CF(I,112),I= 1, 6) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,112),I= 7, 12) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,112),I= 13, 18) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,112),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,112),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,112),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,112),I= 37, 42) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,112),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,112),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,112),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I= 85, 90) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,112),I= 91, 96) 
/-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 97,102) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I=109,114) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,112),I=115,120) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7216,7224) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128/ C 1 T(7,5,2,6,1,3,4) - DATA (CF(I,113),I= 1, 6) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,113),I= 7, 12) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,113),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 25, 30) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,113),I= 31, 36) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,113),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,113),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,113),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,113),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,113),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,113),I= 73, 78) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,113),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,113),I= 85, 90) /3.086419753086420D-02, - 
$ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,113),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 97,102) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,113),I=103,108) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,113),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7225,7232) /4096,-1024,-16,128,-160,-16,-1024,128/ C 1 T(7,5,6,1,2,3,4) - DATA (CF(I,114),I= 1, 6) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,114),I= 7, 12) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,114),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,114),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,114),I= 25, 30) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,114),I= 31, 36) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,114),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,114),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,114),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,114),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,114),I= 79, 84) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 
- $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,114),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 97,102) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I=103,108) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I=109,114) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,114),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7233,7239) /4096,-160,-16,-16,128,128,-1024/ C 1 T(7,5,6,2,1,3,4) - DATA (CF(I,115),I= 1, 6) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,115),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,115),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,115),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,115),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,115),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,115),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,115),I= 55, 60) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,115),I= 61, 66) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,115),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ 
-03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,115),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I=109,114) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I=115,120) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ + DATA (CF(I),I=7240,7245) /4096,-1024,-1024,128,128,1280/ C 1 T(7,6,1,2,5,3,4) - DATA (CF(I,116),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,116),I= 7, 12) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,116),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,116),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,116),I= 31, 36) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,116),I= 37, 42) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,116),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,116),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,116),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,116),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ 
,3.086419753086420D-03/ - DATA (CF(I,116),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,116),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,116),I=103,108) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,116),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,116),I=115,120) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ + DATA (CF(I),I=7246,7250) /4096,128,1280,-1024,128/ C 1 T(7,6,1,5,2,3,4) - DATA (CF(I,117),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,117),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,117),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,117),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 25, 30) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,117),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,117),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,117),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 49, 54) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,117),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,117),I= 61, 66) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,117),I= 67, 72) 
/8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,117),I=109,114) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ + DATA (CF(I),I=7251,7254) /4096,-1024,1280,128/ C 1 T(7,6,2,1,5,3,4) - DATA (CF(I,118),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,118),I= 7, 12) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,118),I= 13, 18) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,118),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,118),I= 31, 36) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,118),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,118),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,118),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,118),I= 61, 66) /-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 97,102) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,118),I=109,114) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I=115,120) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7255,7257) /4096,128,-1024/ C 1 T(7,6,2,5,1,3,4) - DATA (CF(I,119),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,119),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,119),I= 13, 18) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,119),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 25, 30) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,119),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,119),I= 37, 42) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,119),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 49, 54) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,119),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - 
$ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,119),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,119),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I= 97,102) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=103,108) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ + DATA (CF(I),I=7258,7259) /4096,-1024/ C 1 T(7,6,5,1,2,3,4) - DATA (CF(I,120),I= 1, 6) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,120),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,120),I= 13, 18) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,120),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,120),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,120),I= 37, 42) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,120),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ 
-2.191358024691358D-01/ - DATA (CF(I,120),I= 55, 60) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,120),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,120),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I= 97,102) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I=103,108) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I=115,120) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ + DATA (CF(I),I=7260,7260) /4096/ C 1 T(7,6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -18811,10 +10175,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -18823,6 +10189,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
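The MATRIX1 hunks above replace the dense NCOLOR-by-NCOLOR REAL*8 color matrix, previously spelled out column by column in DATA statements, with a single packed integer array covering only the upper triangle: the inner loop now runs J from I to NCOLOR while advancing a running CF_INDEX, and the common denominator is divided out once at the end (MATRIX1 = MATRIX1/DENOM). The sketch below is a minimal C++ rendering of that summation, not the generated code itself; it assumes, as the data above suggest, that the packed entries are the color coefficients rescaled to integers by the common denominator, with off-diagonal entries stored doubled so that the real part of the triangular sum reproduces the full symmetric sum. All names (colorSumPacked, cfPacked, denom) are illustrative.

#include <complex>
#include <vector>

// Hedged sketch of a packed upper-triangular color sum (illustrative names,
// not the plugin's code). cfPacked holds ncolor*(ncolor+1)/2 integers, row by
// row with j >= i; for the 120 color flows above that is 7260 entries,
// matching the last packed index in the new DATA statements.
double colorSumPacked( const std::vector<std::complex<double>>& jamp, // one partial amplitude per color flow
                       const std::vector<int>& cfPacked,              // packed triangular color matrix
                       double denom )                                 // common denominator (DENOM in the Fortran)
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me = 0;
  int idx = 0; // running index into the packed triangle, as CF_INDEX in MATRIX1
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0; // as ZTEMP
    for( int j = i; j < ncolor; j++ ) ztemp += static_cast<double>( cfPacked[idx++] ) * jamp[j];
    me += ( ztemp * std::conj( jamp[i] ) ).real(); // diagonal counted once, doubled off-diagonals supply the symmetric rest
  }
  return me / denom; // single division at the end, as "MATRIX1 = MATRIX1/DENOM"
}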
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + +
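The "new1" striding documented in DeviceAccessJamp above lays out the jamp buffer for one helicity as two contiguous ncolor*nevt planes, all real parts first and then all imaginary parts, with the event index running fastest. Each plane can therefore be read directly as a column-major nevt-by-ncolor matrix with leading dimension nevt, the shape a cuBLAS/hipBLAS GEMM over all events expects, which is presumably what makes the same layout usable both by the CUDA kernels and by the HASBLAS=hasBlas path. Below is a small sketch of the index arithmetic; the helper name jampIndexNew1 is hypothetical and not part of color_sum.h:

#include <cassert>
#include <cstddef>

// Hedged sketch: flat offset into the "new1" jamp buffer for one helicity.
// part = 0 selects the real plane, part = 1 the imaginary plane; within a
// plane the event index ievt runs fastest, so element (ievt, icol) of the
// column-major nevt-by-ncolor matrix sits at offset icol * nevt + ievt.
inline std::size_t jampIndexNew1( std::size_t part, std::size_t icol, std::size_t ievt, std::size_t ncolor, std::size_t nevt )
{
  assert( part < 2 && icol < ncolor && ievt < nevt );
  return part * ncolor * nevt + icol * nevt + ievt; // same arithmetic as kernelAccessIcol above
}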
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values: MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! the min of the physical stot and dsqrt_shatmax**2 (when the latter is set) integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq.
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You cannot change it."
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
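Note on the PY8Card changes above: when the card is written for Pythia8's stock main164 driver (use_mg5amc_py8_interface=False together with direct_pythia_input=True), old-convention keys are renamed, or commented out with a leading '!', via the interface_to_164 map, and HepMC output additionally requires the Main:HepMC switch. A minimal standalone sketch of that translation, where only the mapping entries mirror the patch and translate_for_164 is an illustrative helper, not code from the diff:

import io

# a few entries copied from the interface_to_164 map; a leading '!' comments a key out
INTERFACE_TO_164 = {
    'HEPMCoutput:file': 'HepMC:output',
    'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)',
}

def translate_for_164(param, value, output):
    """Write one old-convention setting in the main164 convention."""
    entry = INTERFACE_TO_164.get(param, param)
    if entry == 'HepMC:output':
        # special case: HepMC output needs two flags
        output.write('%s=%s\n' % ('Main:HepMC', 'on'))
    output.write('%s=%s\n' % (entry, value))

out = io.StringIO()
translate_for_164('HEPMCoutput:file', 'events.hepmc', out)
print(out.getvalue())  # -> Main:HepMC=on, then HepMC:output=events.hepmc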
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('missing an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external programs (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def 
cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
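Two scheduling changes meet around this point. First, the cluster.py hunk earlier teaches MultiCore to pin each worker thread to one GPU: MG5_GPU_VISIBLE_DEVICES may name a 'get,set' pair of environment variables (e.g. 'NVIDIA_VISIBLE_DEVICES,CUDA_VISIBLE_DEVICES'), the device list is read from the get-variable, and start_demon assigns devices round-robin through a per-worker environment copy. Second, gen_ximprove_gridpack now splits a channel into ceil(needed_event/max_request_event) jobs and, when nprocs > 1, drives them through cluster.MultiCore(nb_core=self.nprocs). A minimal sketch of both ideas, with hypothetical helper names:

import os

def split_jobs(needed_event, max_request_event=2500):
    # same ceiling division as get_job_for_event:
    # ((needed_event - 1) // max_request_event) + 1
    return max(1, ((int(needed_event) - 1) // max_request_event) + 1)

def worker_gpu_env(worker_index, devices, set_var='CUDA_VISIBLE_DEVICES'):
    # round-robin pinning: worker k sees only devices[k % len(devices)]
    env = os.environ.copy()
    env[set_var] = devices[worker_index % len(devices)]
    return env

print(split_jobs(9000))                                       # -> 4 jobs of <= 2500 events
print(worker_gpu_env(5, ['0', '1'])['CUDA_VISIBLE_DEVICES'])  # -> '1'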
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
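The lhe_parser hunk above also fixes a sign error in the pseudorapidity property: the correct definition is eta = 0.5*ln((|p| + pz)/(|p| - pz)), which is positive for momenta along +z and matches the textbook identity eta = -ln(tan(theta/2)). A quick standalone check of the corrected formula:

import math

def pseudorapidity(px, py, pz):
    # corrected sign convention: positive eta for +z momenta
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

theta = 0.1  # polar angle in radians
print(pseudorapidity(math.sin(theta), 0.0, math.cos(theta)))  # ~2.9949
print(-math.log(math.tan(theta / 2)))                         # ~2.9949, agrees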
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s -./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the showered lhe will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
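The new remove_empty_events helper above prunes G directories whose events.lhe is missing or nearly empty, then reads each channel log backwards to classify why: cuts too tight, all amplitudes zero, an impossible Breit-Wigner configuration, or unknown. A simplified sketch of that bucketing, where the marker strings are copied from the patch but classify_empty_gdir is an illustrative stand-in for the misc.BackRead loop:

import collections

MARKERS = [
    ('Impossible BW configuration', 'bwconfig'),
    ('Loosen cuts or increase max_events', 'cuts'),
    ('all returned zero', 'zero'),
]

def classify_empty_gdir(log_lines, tail=150):
    """Scan the last `tail` log lines for a known failure marker."""
    for line in reversed(log_lines[-tail:]):
        for marker, reason in MARKERS:
            if marker in line:
                return reason
    return 'unknown'

reasons = collections.defaultdict(list)
logs = {'P1_gg_ttx/G1': ['...', 'Loosen cuts or increase max_events'],
        'P1_gg_ttx/G2': ['...', 'Impossible BW configuration']}
for gdir, lines in logs.items():
    reasons[classify_empty_gdir(lines)].append(gdir)
print(dict(reasons))  # {'cuts': ['P1_gg_ttx/G1'], 'bwconfig': ['P1_gg_ttx/G2']}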
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value, globals(), param_values)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/madevent b/epochX/cudacpp/gg_ttggg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttggg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('Looks like you have madgraph in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, BLAS is supported (MGONGPU_HAS_NO_BLAS is undefined), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) 
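To make the new build-time switch concrete, here is a minimal sketch (illustrative only, not part of this patch; the helper name dummyBlasProbe is invented) of how code built against mgOnGpuConfig.h can branch on MGONGPU_HAS_NO_BLAS, so that the cuBLAS path is only compiled when the toolkit provides it:

  // Illustrative sketch only: dummyBlasProbe is an invented name, not plugin API.
  #include "mgOnGpuConfig.h"
  #ifndef MGONGPU_HAS_NO_BLAS
  #include "cublas_v2.h"
  #endif
  inline bool dummyBlasProbe()
  {
  #ifndef MGONGPU_HAS_NO_BLAS
    cublasHandle_t handle; // BLAS build: try to create and destroy a handle
    if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS ) return false;
    cublasDestroy( handle );
    return true;
  #else
    return false; // noBLAS build (e.g. plain C++): the BLAS path is compiled out
  #endif
  }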
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 37d3314a5d..a9c9e37bd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005965471267700195  +DEBUG: model prefixing takes 0.0055277347564697266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
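The operator+= and operator-= overloads added to cxtype_ref in the mgOnGpuCxtypes.h hunk above let complex accumulation write directly through the split real/imaginary storage, without first materialising a cxtype temporary. A stand-alone sketch of that proxy-reference pattern (plain C++ with std::complex and invented names, instead of the plugin's fptype/cxtype machinery):

  // Sketch of a proxy reference over split real/imag arrays (simplified stand-in
  // for cxtype_ref; the names cxref, re, im are invented for illustration).
  #include <cassert>
  #include <complex>
  struct cxref
  {
    double* re; double* im;
    cxref& operator+=( const std::complex<double>& c ) { *re += c.real(); *im += c.imag(); return *this; }
    cxref& operator-=( const std::complex<double>& c ) { *re -= c.real(); *im -= c.imag(); return *this; }
    operator std::complex<double>() const { return { *re, *im }; }
  };
  int main()
  {
    double re[2] = { 1., 2. }, im[2] = { 3., 4. }; // SOA-style split storage
    cxref j0{ &re[0], &im[0] };
    j0 += std::complex<double>( 0.5, -1. ); // accumulate a contribution in place
    assert( re[0] == 1.5 && im[0] == 2. );
    return 0;
  }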
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.863 s +1 processes with 1240 diagrams generated in 1.896 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.535 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.539 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.343 s VVV1 VVV1 FFV1 @@ -190,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.348 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m12.948s -user 0m12.781s -sys 0m0.107s +real 0m12.885s +user 0m12.736s +sys 0m0.086s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+ +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+ +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
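Further down in this file, the BLAS color sum is toggled at run time by an environment variable that is parsed only once per process. A condensed sketch of that read-once idiom (using a static lambda initialiser in place of the patch's explicit 'static bool first' flag; the helper name useBlasColorSum is invented) is:

  // Sketch only: read CUDACPP_RUNTIME_BLASCOLORSUM once and cache the decision.
  #include <cstdlib>
  #include <string>
  inline bool useBlasColorSum()
  {
    static const bool enabled = []() {
      const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
      return env && std::string( env ) != ""; // set and non-empty => enable BLAS
    }();
    return enabled;
  }

The patch itself additionally throws a runtime_error when the variable is set in a build where BLAS was disabled at compile time; the sketch omits that guard.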
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
#endif
- checkGpu( gpuPeekAtLastError() );
- checkGpu( gpuDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() ); // is this needed?
+ checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H
#define MATRIXELEMENTKERNELS_H 1
#include "mgOnGpuConfig.h"
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
#include "MemoryBuffers.h"
#include
+#include
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
// Does this host system support the SIMD used in the matrix element calculation?
// [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
- static bool hostSupportsSIMD( const bool verbose = true );
+ static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
private:
@@ -191,12 +194,24 @@ namespace mg5amcCpu
// The buffer for the event-by-event couplings that depends on alphas QCD
DeviceBufferCouplings m_couplings;
+ // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+ // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+ // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // The buffer for the event-by-event numerators of multichannel factors
- DeviceBufferNumerators m_numerators;
+ // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
- // The buffer for the event-by-event denominators of multichannel factors
- DeviceBufferDenominators m_denominators;
+ // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+ // The super-buffer of ncolor jamp2 buffers
+ DeviceBufferSimple m_colJamp2s;
#endif
#ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
PinnedHostBufferChannelIds m_hstChannelIds;
#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+ // Decide at runtime whether to use BLAS for color sums
+ bool m_blasColorSum;
+
+ // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+ bool m_blasTf32Tensor;
+
+ // The super-buffer of nGoodHel cuBLAS/hipBLAS
temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+ // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+ gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used)
+#endif
+
+ // The array of GPU streams (one for each good helicity)
+ gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
// The number of blocks in the GPU grid
size_t m_gpublocks;
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
#ifndef MemoryAccessAmplitudes_H
#define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
#include "mgOnGpuCxtypes.h"
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
{
//----------------------------------------------------------------------------
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- // A class describing the internal layout of memory buffers for amplitudes
- // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
- // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
- class MemoryAccessAmplitudesBase //_AOSOAv1
- {
- public:
-
- // Number of Events Per Page in the amplitude AOSOA memory buffer layout
- static constexpr int neppA = 1; // AOS (just a test...)
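
For reference, a minimal standalone sketch of the handle/stream lifecycle behind the m_helBlasHandles and m_helStreams members declared in MatrixElementKernels.h above, written directly against the CUDA/cuBLAS APIs that the gpuBlas*/gpuStream* wrappers abstract. Here ncomb, nGoodHel and useTf32 are hypothetical stand-ins, and the plugin's checkGpuBlas error handling is simplified to a check() helper; this is an illustration, not the plugin code.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

constexpr int ncomb = 32; // assumption: stand-in for CPPProcess::ncomb

static void check( bool ok, const char* msg )
{
  if( !ok ) { std::fprintf( stderr, "ERROR: %s\n", msg ); std::exit( 1 ); }
}

int main()
{
  const int nGoodHel = 4;     // assumption: normally returned by sigmaKin_setGoodHel
  const bool useTf32 = false; // assumption: stand-in for the CUDACPP_RUNTIME_CUBLASTF32TENSOR choice
  cudaStream_t helStreams[ncomb] = {};      // only the first nGoodHel entries are used
  cublasHandle_t helBlasHandles[ncomb] = {};
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    check( cudaStreamCreate( &helStreams[ighel] ) == cudaSuccess, "cudaStreamCreate" );
    check( cublasCreate( &helBlasHandles[ighel] ) == CUBLAS_STATUS_SUCCESS, "cublasCreate" );
    // Attach a different stream to each handle so per-helicity color sums can overlap
    check( cublasSetStream( helBlasHandles[ighel], helStreams[ighel] ) == CUBLAS_STATUS_SUCCESS, "cublasSetStream" );
    if( useTf32 ) // enable TF32 tensor cores (only meaningful for FP32 GEMMs)
      check( cublasSetMathMode( helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) == CUBLAS_STATUS_SUCCESS, "cublasSetMathMode" );
  }
  // ... enqueue one color-sum GEMM per good helicity on its own stream/handle here ...
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // mirror of the destructor: destroy only what was created
  {
    cublasDestroy( helBlasHandles[ighel] );
    cudaStreamDestroy( helStreams[ighel] );
  }
  return 0;
}
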
-
- private:
-
- friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
- friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
- friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-
- // The number of floating point components of a complex number
- static constexpr int nx2 = mgOnGpu::nx2;
-
- //--------------------------------------------------------------------------
- // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
- // (in other words: first locate the event record for a given event, then locate an element in that record)
- //--------------------------------------------------------------------------
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static __host__ __device__ inline fptype*
- ieventAccessRecord( fptype* buffer,
- const int ievt )
- {
- const int ipagA = ievt / neppA; // #event "A-page"
- const int ieppA = ievt % neppA; // #event in the current event A-page
- constexpr int ix2 = 0;
- return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
- }
-
- //--------------------------------------------------------------------------
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
- // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
- static __host__ __device__ inline fptype&
- decodeRecord( fptype* buffer,
- const int ix2 )
- {
- constexpr int ipagA = 0;
- constexpr int ieppA = 0;
- return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
- }
- };
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on explicit event numbers
- // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
- class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
- {
- public:
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
- static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
- static constexpr auto decodeRecordIx2Const =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
- static constexpr auto ieventAccessIx2 =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
- static constexpr auto ieventAccessIx2Const =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
- };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on implicit kernel rules
- // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+ // A class providing trivial access to amplitude memory buffers
template<bool onDevice>
class KernelAccessAmplitudes
{
public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
- static constexpr auto kernelAccessIx2 =
- KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
- static constexpr auto kernelAccessIx2Const =
- KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
{
@@ -148,8 +35,6 @@ namespace mg5amcCpu
{
return reinterpret_cast<const cxtype_sv*>( buffer );
}
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
};
//----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
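
The surviving KernelAccessAmplitudes accessors above rely on the fact that one event's amplitude is just a [real, imaginary] fptype pair, so the accessor is a pointer reinterpretation with no event indexing at all. A minimal standalone sketch of this "trivial access" idiom, using std::complex as a hypothetical stand-in for cxtype_sv in a scalar build:

#include <cassert>
#include <complex>

typedef double fptype;               // assumption: stand-in for the plugin's fptype
typedef std::complex<fptype> cxtype; // assumption: stand-in for cxtype_sv in scalar builds

inline cxtype* kernelAccess( fptype* buffer )
{
  return reinterpret_cast<cxtype*>( buffer ); // trivial: buffer holds one event's [re,im]
}

int main()
{
  fptype amp_fp[2] = { 1.5, -0.5 }; // one amplitude: real and imaginary parts
  cxtype* amp = kernelAccess( amp_fp );
  assert( amp->real() == 1.5 && amp->imag() == -0.5 );
  return 0;
}
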
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
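
The removed AOSOA[npagW][nw6][nx2][neppW] layout above was already configured with neppW=1, i.e. a plain AOS. A small standalone sketch (hypothetical loop bounds) verifying that with neppW=1 the AOSOA index reduces to the simple ievt*nw6*nx2 offset used by the new DeviceAccessWavefunctions accessor below:

#include <cassert>

constexpr int nw6 = 6;   // components of a spin-1/2 or spin-1 wavefunction
constexpr int nx2 = 2;   // real and imaginary parts
constexpr int neppW = 1; // events per page in the old AOSOA layout (AOS)

// old-style AOSOA index
inline int aosoaIndex( int ievt, int iw6, int ix2 )
{
  const int ipagW = ievt / neppW;
  const int ieppW = ievt % neppW;
  return ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW;
}

int main()
{
  // with neppW=1 the AOSOA index equals the AOS offset ievt*nw6*nx2 + iw6*nx2 + ix2
  for( int ievt = 0; ievt < 4; ievt++ )
    for( int iw6 = 0; iw6 < nw6; iw6++ )
      for( int ix2 = 0; ix2 < nx2; ix2++ )
        assert( aosoaIndex( ievt, iw6, ix2 ) == ievt * nw6 * nx2 + iw6 * nx2 + ix2 );
  return 0;
}
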
-
- private:
-
- friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
- friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
- friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-
- // The number of components of a (fermion or vector) wavefunction
- static constexpr int nw6 = mgOnGpu::nw6;
-
- // The number of floating point components of a complex number
- static constexpr int nx2 = mgOnGpu::nx2;
-
- //--------------------------------------------------------------------------
- // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
- // (in other words: first locate the event record for a given event, then locate an element in that record)
- //--------------------------------------------------------------------------
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static __host__ __device__ inline fptype*
- ieventAccessRecord( fptype* buffer,
- const int ievt )
+ static __host__ __device__ inline cxtype_sv*
+ kernelAccess( fptype* buffer )
{
- const int ipagW = ievt / neppW; // #event "W-page"
- const int ieppW = ievt % neppW; // #event in the current event W-page
- constexpr int iw6 = 0;
- constexpr int ix2 = 0;
- return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
}
-
- //--------------------------------------------------------------------------
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
- // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
- static __host__ __device__ inline fptype&
- decodeRecord( fptype* buffer,
- const int iw6,
- const int ix2 )
+ static __host__ __device__ inline const cxtype_sv*
+ kernelAccessConst( const fptype* buffer )
{
- constexpr int ipagW = 0;
- constexpr int ieppW = 0;
- return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
+ }
};
+#endif
//----------------------------------------------------------------------------
- // A class providing access to memory buffers for a given event, based on explicit event numbers
- // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
- class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
- {
- public:
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto decodeRecordIw6Ix2Const =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
- static constexpr auto ieventAccessIw6Ix2 =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
- static constexpr auto ieventAccessIw6Ix2Const =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
- };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on implicit kernel rules
- // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
- template<bool onDevice>
- class KernelAccessWavefunctions
+ class HostAccessWavefunctions
{
public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input)
- // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto kernelAccessIw6Ix2 =
- KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto kernelAccessIw6Ix2Const =
- KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); }
-
static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
};
//----------------------------------------------------------------------------
- typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
- typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
- //----------------------------------------------------------------------------
-
} // end namespace mg5amcGpu/mg5amcCpu
#endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
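
To make the sizing of the per-helicity "super-buffers" allocated via the MemoryBuffers.h classes concrete, a standalone sketch using the e+ e- -> mu+ mu- toy values quoted in the code comments (ncomb=16, nwf=5, nw6=6, plus ncolor=1 for this single-color-flow process), with hypothetical nevt and nGoodHel; the formulas are the ones used in computeGoodHelicities above.

#include <cstdio>

int main()
{
  const long long nevt = 16384;  // assumption: gpublocks * gputhreads for a typical run
  const long long nGoodHel = 16; // assumption: all ncomb=16 helicities turn out to be good
  const long long ncolor = 1, nwf = 5, nw6 = 6, nx2 = 2;
  const long long nMEs = nGoodHel * nevt;                   // m_pHelMEs elements
  const long long nJamps = nGoodHel * ncolor * nx2 * nevt;  // m_pHelJamps elements
  const long long nWfs = nGoodHel * nwf * nw6 * nx2 * nevt; // m_pHelWfs elements
  std::printf( "MEs=%lld jamps=%lld wfs=%lld fptype elements\n", nMEs, nJamps, nWfs );
  // for gg->ttggg (ncolor=120, nwf=121, ncomb=128) the same formulas give far larger buffers
  return 0;
}
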
#ifndef MemoryBuffers_H
#define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
static constexpr size_t nparf = CPPProcess::nparf;
static constexpr size_t npar = CPPProcess::npar;
static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup;
+ static constexpr size_t ncolor = CPPProcess::ncolor;
}
//--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
protected:
BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
- virtual ~BufferBase() {}
public:
+ virtual ~BufferBase() {}
T* data() { return m_data; }
const T* data() const { return m_data; }
T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
public:
HostBuffer( const size_t nevt )
: NumberOfEvents( nevt )
- , HostBufferBase<T>( sizePerEvent * nevt ) {}
- virtual ~HostBuffer() {}
+ , HostBufferBase<T>( sizePerEvent * nevt )
+ {
+ //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+ }
+ virtual ~HostBuffer()
+ {
+ //std::cout << "HostBuffer::dtor " << this << std::endl;
+ }
virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
};
#endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
// A class encapsulating a CUDA device buffer for a given number of events
template<typename T, size_t sizePerEvent>
- class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+ class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
{
public:
DeviceBuffer( const size_t nevt )
: NumberOfEvents( nevt )
- , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
- virtual ~DeviceBuffer() {}
+ , DeviceBufferBase<T>( sizePerEvent * nevt )
+ {
+ //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+ }
+ virtual ~DeviceBuffer()
+ {
+ //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+ }
virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
};
#endif
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+ typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+ typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+ //--------------------------------------------------------------------------
+
// A base class encapsulating a memory buffer for momenta random numbers
typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventNumerators = 1;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for numerators
typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for numerators
typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for numerators
typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
#endif
#endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventDenominators = 1;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for denominators
typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for denominators
typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for denominators
typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
#endif
#endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for couplings
typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for couplings
typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for couplings
typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
#endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // The size (number of elements) per event in a memory buffer for jamps
+ constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+ // A class encapsulating a CUDA device buffer for color selection
+ typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+ //--------------------------------------------------------------------------
+
#ifdef MGONGPUCPP_GPUIMPL
template<class Tdst, class Tsrc>
void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
index 07099839d3..6b4b8dc8ce 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
@@ -1,13 +1,13 @@
// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
//==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
//==========================================================================
// This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,31903 +279,2613 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
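
Since calculate_jamps no longer performs the color sum itself (that step now lives in a separate function/kernel via the color_sum.h include), a compact standalone illustration of the quadratic form |M|^2 = sum_ij conj(jamp_i) * cf_ij * jamp_j that the BLAS path evaluates as real matrix products over all events per helicity. The two-color cf matrix below is a hypothetical toy, not a physics color matrix, and the loop stands in for the GEMM that cuBLAS/hipBLAS performs in the plugin.

#include <cassert>
#include <complex>

typedef double fptype;
constexpr int ncolor = 2; // assumption: toy value (gg->ttggg uses 120)

fptype colorSum( const std::complex<fptype> jamp[ncolor], const fptype cf[ncolor][ncolor] )
{
  fptype me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<fptype> t = 0; // t_i = (cf * jamp)_i: the GEMM step when done for all events at once
    for( int j = 0; j < ncolor; j++ ) t += cf[i][j] * jamp[j];
    me += ( std::conj( jamp[i] ) * t ).real(); // dot-product step (imaginary parts cancel for symmetric cf)
  }
  return me;
}

int main()
{
  const fptype cf[2][2] = { { 16, -2 }, { -2, 16 } };             // assumption: toy symmetric color matrix
  const std::complex<fptype> jamp[2] = { { 1, 2 }, { 0, -1 } };   // assumption: toy partial amplitudes
  assert( colorSum( jamp, cf ) > 0 ); // the color sum is a positive quadratic form here (value 104)
  return 0;
}
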
-
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
// === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
// === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1240 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); - VVV1P0_1( 
w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 2 OF 1240 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 3 OF 1240 *** - - // Wavefunction(s) for diagram number 3 - // (none) - - // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += 
amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 1240 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 5 OF 1240 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 6 OF 1240 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
-
-    // *** DIAGRAM 6 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 6
-    // (none)
-
-    // Amplitude(s) for diagram number 6
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 7 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 7
-    VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] );
-
-    // Amplitude(s) for diagram number 7
-    VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 8 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 8
-    // (none)
-
-    // Amplitude(s) for diagram number 8
-    VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 9 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 9
-    // (none)
-
-    // Amplitude(s) for diagram number 9
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 10 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 10
-    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
-    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] );
-    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
-
-    // Amplitude(s) for diagram number 10
-    VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 11 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 11
-    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] );
-    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] );
-    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] );
-
-    // Amplitude(s) for diagram number 11
-    VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 12 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 12
-    VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
-    VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 12
-    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 13 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 13
-    VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] );
-
-    // Amplitude(s) for diagram number 13
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 14 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 14
-    VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 14
-    VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 15 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 15
-    VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
-
-    // Amplitude(s) for diagram number 15
-    VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 16 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 16
-    // (none)
-
-    // Amplitude(s) for diagram number 16
-    VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 17 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 17
-    VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
-
-    // Amplitude(s) for diagram number 17
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 18 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 18
-    // (none)
-
-    // Amplitude(s) for diagram number 18
-    VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 19 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 19
-    VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
-
-    // Amplitude(s) for diagram number 19
-    VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 20 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 20
-    // (none)
-
-    // Amplitude(s) for diagram number 20
-    VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 21 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 21
-    VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
-
-    // Amplitude(s) for diagram number 21
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 22 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 22
-    // (none)
-
-    // Amplitude(s) for diagram number 22
-    VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 23 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 23
-    // (none)
-
-    // Amplitude(s) for diagram number 23
-    VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 24 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 24
-    VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 24
-    VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 25 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 25
-    VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
-    VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
-    VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
-
-    // Amplitude(s) for diagram number 25
-    VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
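Throughout these blocks, each `jamp_sv[i]` accumulates the amplitudes of all diagrams feeding color flow `i`; the `+=`/`-=` signs (and, for the quark-line diagrams below, factors `cxtype( 0, 1 )`) are the coefficients of the color decomposition. Once all 1240 diagrams have been summed, the color flows are contracted with the constant color matrix. A hedged sketch of that final contraction, using the `ncolor`/`cf`/`denom` names of the generated CPPProcess.cc (an illustration, not the verbatim code):

```cpp
// Illustration only: color-sum the jamp_sv flows into the matrix element.
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype_sv ztemp_sv = cxzero_sv();
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp_sv += cf[icol][jcol] * jamp_sv[jcol];                           // color matrix times color flows
  deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol]; // |M|^2 contribution
}
```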
-
-    // *** DIAGRAM 26 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 26
-    FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] );
-    FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
-    FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] );
-
-    // Amplitude(s) for diagram number 26
-    FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+    // Dependent couplings, vary event-by-event
+    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+      COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 27 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 27
-    FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
-
-    // Amplitude(s) for diagram number 27
-    FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 28 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 28
-    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
-
-    // Amplitude(s) for diagram number 28
-    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 29 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 29
-    // (none)
-
-    // Amplitude(s) for diagram number 29
-    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 30 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 30
-    // (none)
-
-    // Amplitude(s) for diagram number 30
-    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
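The lines added above belong to the coupling-access preamble of `calculate_wavefunctions`: the first `ndcoup` slots of `allCOUPs` point into the event-dependent couplings buffer, the following slots point into the constant independent couplings, and the per-event `COUPs` pointers are then derived from them. The change tagged FIX #823 bounds the independent-coupling loops by `nIPC` (the number of couplings actually stored in `cIPC`) instead of `nicoup`. A condensed sketch of the resulting layout (the array size `ndcoup + nIPC` is an assumption for illustration; the generated file sizes it differently):

```cpp
// Sketch of the coupling bookkeeping implied by the added lines above.
const fptype* allCOUPs[ndcoup + nIPC];               // assumed size, for illustration only
for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )  // dependent couplings: vary event-by-event
  allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )    // independent couplings: fixed for all events (FIX #823)
  allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
```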
-
-    // *** DIAGRAM 31 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 31
-    // (none)
-
-    // Amplitude(s) for diagram number 31
-    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 32 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 32
-    // (none)
-
-    // Amplitude(s) for diagram number 32
-    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 33 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 33
-    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
-    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-
-    // Amplitude(s) for diagram number 33
-    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 34 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 34
-    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 34
-    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 35 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 35
-    // (none)
-
-    // Amplitude(s) for diagram number 35
-    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 36 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 36
-    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
-
-    // Amplitude(s) for diagram number 36
-    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 37 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 37
-    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
-
-    // Amplitude(s) for diagram number 37
-    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 38 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 38
-    // (none)
-
-    // Amplitude(s) for diagram number 38
-    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 39 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 39
-    // (none)
-
-    // Amplitude(s) for diagram number 39
-    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 40 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 40
-    // (none)
-
-    // Amplitude(s) for diagram number 40
-    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 41 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 41
-    // (none)
-
-    // Amplitude(s) for diagram number 41
-    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 42 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 42
-    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
-
-    // Amplitude(s) for diagram number 42
-    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 43 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 43
-    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
-
-    // Amplitude(s) for diagram number 43
-    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 44 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 44
-    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
-
-    // Amplitude(s) for diagram number 44
-    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 45 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 45
-    // (none)
-
-    // Amplitude(s) for diagram number 45
-    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 46 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 46
-    // (none)
-
-    // Amplitude(s) for diagram number 46
-    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 47 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 47
-    // (none)
-
-    // Amplitude(s) for diagram number 47
-    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 48 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 48
-    // (none)
-
-    // Amplitude(s) for diagram number 48
-    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 49 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 49
-    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
-    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-
-    // Amplitude(s) for diagram number 49
-    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 50 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 50
-    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 50
-    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 51 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 51
-    // (none)
-
-    // Amplitude(s) for diagram number 51
-    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 52 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 52
-    // (none)
-
-    // Amplitude(s) for diagram number 52
-    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 53 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 54 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 55 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 58
-    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
-
-    // Amplitude(s) for diagram number 58
-    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 59
-    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
-
-    // Amplitude(s) for diagram number 59
-    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
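Every deleted block follows the same three-step pattern that the code generator unrolls once per diagram, as diagram 60 just below illustrates: compute any new internal wavefunctions into recycled `w_fp` scratch slots, evaluate one helicity-amplitude routine into `amp_fp[0]`, then scatter `amp_sv[0]` into the affected color flows. Schematically (lines taken from diagram 60, comments added):

```cpp
FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );  // step 1: internal-line wavefunction into scratch slot 51
VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); // step 2: diagram amplitude
jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];                       // step 3: one of eight color-flow updates for this diagram
```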
-
-    // *** DIAGRAM 60 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 60
-    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 62 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 63 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 64 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 65
-    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 66
-    // (none)
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 67 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 68 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 71 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 74
-    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
-
-    // Amplitude(s) for diagram number 74
-    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 75
-    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
-
-    // Amplitude(s) for diagram number 75
-    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 76
-    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 78 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 80 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 81
-    // (none)
-
// Amplitude(s) for diagram number 81 - FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 82 OF 1240 *** - - // Wavefunction(s) for diagram number 82 - // (none) - - // Amplitude(s) for diagram number 82 - FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 83 OF 1240 *** - - // Wavefunction(s) for diagram number 83 - // (none) - - // Amplitude(s) for diagram number 83 - FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 84 OF 1240 *** - - // Wavefunction(s) for diagram number 84 - FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); - - // Amplitude(s) for diagram number 84 - FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 85 OF 1240 *** - - // Wavefunction(s) for diagram number 85 - FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); - - // Amplitude(s) for diagram number 85 - FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 86 OF 1240 *** - - // Wavefunction(s) for diagram number 86 - FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 86 - VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 87 OF 1240 *** - - // Wavefunction(s) for diagram number 87 - // (none) - - // Amplitude(s) for diagram number 87 - FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 88 OF 1240 *** - - // Wavefunction(s) for diagram number 88 - // (none) - - // Amplitude(s) for diagram number 88 - VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 89 OF 1240 *** - - // Wavefunction(s) for diagram number 89 - // (none) - - // Amplitude(s) for diagram number 89 - FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - - // *** DIAGRAM 90 OF 1240 *** - - // Wavefunction(s) for diagram number 90 - // (none) - - // Amplitude(s) for diagram number 90 - FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 91 OF 1240 *** - - // Wavefunction(s) for diagram number 91 - // (none) - - // Amplitude(s) for diagram number 91 - FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - - // *** DIAGRAM 92 OF 1240 *** - - // Wavefunction(s) for diagram number 92 - // (none) - - // Amplitude(s) for diagram number 92 - FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 93 OF 1240 *** - - // Wavefunction(s) for diagram number 93 - // (none) - - // Amplitude(s) for diagram number 93 - FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 94 OF 1240 *** - - // Wavefunction(s) for diagram number 94 - FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); - - // Amplitude(s) for diagram number 94 - FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 95 OF 1240 *** - - // Wavefunction(s) for diagram number 95 - FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); - - // Amplitude(s) for diagram number 95 - FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 96 OF 1240 *** - - // Wavefunction(s) for diagram number 96 - FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 96 - VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 97 OF 1240 *** - - // Wavefunction(s) for diagram number 97 - // (none) - - // Amplitude(s) for diagram number 97 - FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; 
- jamp_sv[24] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - - // *** DIAGRAM 98 OF 1240 *** - - // Wavefunction(s) for diagram number 98 - // (none) - - // Amplitude(s) for diagram number 98 - VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 99 OF 1240 *** - - // Wavefunction(s) for diagram number 99 - // (none) - - // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - - // *** DIAGRAM 100 OF 1240 *** - - // Wavefunction(s) for diagram number 100 - // (none) - - // Amplitude(s) for diagram number 100 - FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 101 OF 1240 *** - - // Wavefunction(s) for diagram number 101 - // (none) - - // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - 
jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - - // *** DIAGRAM 102 OF 1240 *** - - // Wavefunction(s) for diagram number 102 - // (none) - - // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 103 OF 1240 *** - - // Wavefunction(s) for diagram number 103 - // (none) - - // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 104 OF 1240 *** - - // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); - - // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 105 OF 1240 *** - - // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); - - // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 106 OF 1240 *** - - // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 107 OF 1240 *** - - // Wavefunction(s) for diagram number 107 - // (none) - - // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 108 OF 1240 *** - - // Wavefunction(s) for diagram number 108 - // (none) - - // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 109 OF 1240 *** - - // Wavefunction(s) for diagram number 109 - // (none) - - // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 110 OF 1240 *** - - // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - - // *** DIAGRAM 111 OF 1240 *** - - // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 112 OF 1240 *** - - // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 113 OF 1240 *** - - // Wavefunction(s) for diagram 
number 113 - // (none) - - // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 114 OF 1240 *** - - // Wavefunction(s) for diagram number 114 - // (none) - - // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 115 OF 1240 *** - - // Wavefunction(s) for diagram number 115 - // (none) - - // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 116 OF 1240 *** - - // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - - // *** DIAGRAM 117 OF 1240 *** - - // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); - - // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 118 OF 1240 *** - - // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); - - // Amplitude(s) for 
diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 119 OF 1240 *** - - // Wavefunction(s) for diagram number 119 - // (none) - - // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 120 OF 1240 *** - - // Wavefunction(s) for diagram number 120 - // (none) - - // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 121 OF 1240 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 122 OF 1240 *** - - // Wavefunction(s) for diagram number 122 - // (none) - - // Amplitude(s) for diagram number 122 - FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) 
* amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 123 OF 1240 *** - - // Wavefunction(s) for diagram number 123 - // (none) - - // Amplitude(s) for diagram number 123 - FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 124 OF 1240 *** - - // Wavefunction(s) for diagram number 124 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); - FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); - - // Amplitude(s) for diagram number 124 - FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 125 OF 1240 *** - - // Wavefunction(s) for diagram number 125 - FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 
125 - FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - - // *** DIAGRAM 126 OF 1240 *** - - // Wavefunction(s) for diagram number 126 - FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); - FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); - - // Amplitude(s) for diagram number 126 - FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 127 OF 1240 *** - - // Wavefunction(s) for diagram number 127 - // (none) - - // Amplitude(s) for diagram number 127 - FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= amp_sv[0]; - - // *** DIAGRAM 128 OF 1240 *** - - // Wavefunction(s) for diagram number 128 - FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); - - // Amplitude(s) for diagram number 128 - FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 129 OF 1240 *** - - // Wavefunction(s) for diagram number 129 - // (none) - - // Amplitude(s) for diagram number 129 - FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 130 OF 1240 *** - - // Wavefunction(s) for diagram number 130 - FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); - - // Amplitude(s) for diagram number 130 - VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 131 OF 1240 *** - - // Wavefunction(s) for diagram number 131 - FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - - // Amplitude(s) for diagram number 131 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 132 OF 1240 *** - - // Wavefunction(s) for diagram number 132 - // (none) - - // Amplitude(s) for diagram number 132 - FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 133 OF 1240 *** - - // Wavefunction(s) for diagram number 133 - // (none) 
- - // Amplitude(s) for diagram number 133 - VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 134 OF 1240 *** - - // Wavefunction(s) for diagram number 134 - FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - - // Amplitude(s) for diagram number 134 - FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 135 OF 1240 *** - - // Wavefunction(s) for diagram number 135 - // (none) - - // Amplitude(s) for diagram number 135 - FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 136 OF 1240 *** - - // Wavefunction(s) for diagram number 136 - // (none) - - // Amplitude(s) for diagram number 136 - VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 137 OF 1240 *** - - // Wavefunction(s) for diagram number 137 - // (none) - - // Amplitude(s) for diagram number 137 - FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 138 OF 1240 *** - - // Wavefunction(s) for diagram number 138 - FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); - - // Amplitude(s) for diagram number 138 - FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 139 OF 1240 *** - - // Wavefunction(s) for diagram number 139 - // (none) - - // Amplitude(s) for diagram number 139 - FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - 
FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 140 OF 1240 *** - - // Wavefunction(s) for diagram number 140 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] ); - FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] ); - - // Amplitude(s) for diagram number 140 - VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 141 OF 1240 *** - - // Wavefunction(s) for diagram number 141 - VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] ); - - // Amplitude(s) for diagram number 141 - VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 142 OF 1240 *** - - // Wavefunction(s) for diagram number 142 - // (none) - - // Amplitude(s) for diagram number 142 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 143 OF 1240 *** - - // Wavefunction(s) for diagram number 143 - FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] ); - - // Amplitude(s) for diagram number 143 - FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 144 OF 1240 *** - - // Wavefunction(s) for diagram number 144 - // (none) - - // Amplitude(s) for diagram number 144 - FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 145 OF 1240 *** - - // Wavefunction(s) for diagram number 145 - // (none) - - // Amplitude(s) for diagram number 145 - FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 146 OF 1240 *** - - // Wavefunction(s) for diagram number 146 - // (none) - - // Amplitude(s) for diagram number 146 - FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 147 OF 1240 *** - - // Wavefunction(s) for diagram number 147 - FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 147 - FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 148 OF 1240 *** - - // Wavefunction(s) for diagram number 148 - FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] ); - - // Amplitude(s) for diagram number 148 - VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - - // *** DIAGRAM 149 OF 1240 *** - - // Wavefunction(s) for diagram number 149 - // (none) - - // Amplitude(s) for diagram number 149 - FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 150 OF 1240 *** - - // Wavefunction(s) for diagram number 150 - // (none) - - // Amplitude(s) for diagram number 150 - FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 151 OF 1240 *** - - // Wavefunction(s) for diagram number 151 - FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 151 - VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - - // *** DIAGRAM 152 OF 1240 *** - - // Wavefunction(s) for diagram number 152 - // (none) - - // Amplitude(s) for diagram number 152 - FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 153 OF 1240 *** - - // Wavefunction(s) for diagram number 153 - // (none) - - // Amplitude(s) for diagram number 153 - FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - - // *** DIAGRAM 154 OF 1240 *** - - // Wavefunction(s) for diagram number 154 - // (none) - - // Amplitude(s) for diagram number 154 - VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 155 OF 1240 *** - - // Wavefunction(s) for diagram number 155 - // (none) - - // Amplitude(s) for diagram number 155 - FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 156 OF 1240 *** - - // Wavefunction(s) for diagram number 156 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] ); - - // Amplitude(s) for diagram number 156 - VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 157 OF 1240 *** - - // Wavefunction(s) for diagram number 157 - VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] ); - - // Amplitude(s) for diagram number 157 - VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 158 OF 1240 *** - - // Wavefunction(s) for diagram number 158 - // (none) - - // Amplitude(s) for diagram number 158 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 159 OF 1240 *** - - // Wavefunction(s) for diagram number 159 - FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - - // Amplitude(s) for diagram number 159 - FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 160 OF 1240 *** - - // Wavefunction(s) for diagram number 160 - // (none) - - // Amplitude(s) for diagram number 160 - FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 161 OF 1240 *** - - // Wavefunction(s) for diagram number 161 - // (none) - - // Amplitude(s) for diagram number 161 - FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 162 OF 1240 *** - - // Wavefunction(s) for diagram number 162 - // (none) - - // Amplitude(s) for diagram number 162 - FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 163 OF 1240 *** - - // Wavefunction(s) for diagram number 163 - FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); - - // Amplitude(s) for diagram number 163 - FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 164 OF 1240 *** - - // Wavefunction(s) for diagram number 164 - FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] ); - - // Amplitude(s) for diagram number 164 - VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - - // *** DIAGRAM 165 OF 1240 *** - - // Wavefunction(s) for diagram number 165 - // (none) - - // Amplitude(s) for diagram number 165 - FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 166 OF 1240 *** - - // Wavefunction(s) for diagram number 166 - // (none) - - // Amplitude(s) for diagram number 166 - FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 167 OF 1240 *** - - // Wavefunction(s) for diagram number 167 - // (none) - - // Amplitude(s) for diagram number 
-    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 168 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 168
-    // (none)
-
-    // Amplitude(s) for diagram number 168
-    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 169 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 169
-    // (none)
-
-    // Amplitude(s) for diagram number 169
-    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-
-    // *** DIAGRAM 170 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 170
-    // (none)
-
-    // Amplitude(s) for diagram number 170
-    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 171 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 171
-    // (none)
-
-    // Amplitude(s) for diagram number 171
-    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-
-    // *** DIAGRAM 172 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 172
-    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
-    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
-
-    // Amplitude(s) for diagram number 172
-    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 173 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 173
-    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
-
-    // Amplitude(s) for diagram number 173
-    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 174 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 174
-    // (none)
-
-    // Amplitude(s) for diagram number 174
-    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 175 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 175
-    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
-
-    // Amplitude(s) for diagram number 175
-    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 176 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 176
-    // (none)
-
-    // Amplitude(s) for diagram number 176
-    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 177 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 177
-    // (none)
-
-    // Amplitude(s) for diagram number 177
-    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 178 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 178
-    // (none)
-
-    // Amplitude(s) for diagram number 178
-    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 179 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 179
-    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
-    // Amplitude(s) for diagram number 179
-    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 180 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 180
-    // (none)
-
-    // Amplitude(s) for diagram number 180
-    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 181 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 181
-    // (none)
-
-    // Amplitude(s) for diagram number 181
-    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 182 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 182
-    // (none)
-
-    // Amplitude(s) for diagram number 182
-    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 183 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 183
-    // (none)
-
-    // Amplitude(s) for diagram number 183
-    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 184 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 184
-    // (none)
-
-    // Amplitude(s) for diagram number 184
-    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 185 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 185
-    // (none)
-
-    // Amplitude(s) for diagram number 185
-    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-
-    // *** DIAGRAM 186 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 186
-    // (none)
-
-    // Amplitude(s) for diagram number 186
-    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 187 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 187
-    // (none)
-
-    // Amplitude(s) for diagram number 187
-    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-
-    // *** DIAGRAM 188 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 188
-    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
-    // Amplitude(s) for diagram number 188
-    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-
-    // *** DIAGRAM 189 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 189
-    // (none)
-
-    // Amplitude(s) for diagram number 189
-    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-
-    // *** DIAGRAM 190 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 190
-    FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
-
-    // Amplitude(s) for diagram number 190
-    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 191 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 191
-    // (none)
-
-    // Amplitude(s) for diagram number 191
-    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-
-    // *** DIAGRAM 192 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 192
-    // (none)
-
-    // Amplitude(s) for diagram number 192
-    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 193 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 193
-    // (none)
-
-    // Amplitude(s) for diagram number 193
-    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 194 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 194
-    // (none)
-
-    // Amplitude(s) for diagram number 194
-    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 195 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 195
-    // (none)
-
-    // Amplitude(s) for diagram number 195
-    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 196 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 196
-    // (none)
-
-    // Amplitude(s) for diagram number 196
-    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 197 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 197
-    // (none)
-
-    // Amplitude(s) for diagram number 197
-    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-
-    // *** DIAGRAM 198 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 198
-    // (none)
-
-    // Amplitude(s) for diagram number 198
-    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-
-    // *** DIAGRAM 199 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 199
-    FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
-    // Amplitude(s) for diagram number 199
-    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 200 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 200
-    // (none)
-
-    // Amplitude(s) for diagram number 200
-    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-
-    // *** DIAGRAM 201 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 201
-    // (none)
-
-    // Amplitude(s) for diagram number 201
-    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 202 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 202
-    // (none)
-
-    // Amplitude(s) for diagram number 202
-    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 203 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 203
-    // (none)
-
-    // Amplitude(s) for diagram number 203
-    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 204 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 204
-    // (none)
-
-    // Amplitude(s) for diagram number 204
-    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-
-    // *** DIAGRAM 205 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 205
-    // (none)
-
-    // Amplitude(s) for diagram number 205
-    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 206 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 206
-    // (none)
-
-    // Amplitude(s) for diagram number 206
-    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-
-    // *** DIAGRAM 207 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 207
-    // (none)
-
-    // Amplitude(s) for diagram number 207
-    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-
-    // *** DIAGRAM 208 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 208
-    FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
-    // Amplitude(s) for diagram number 208
-    FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= amp_sv[0];
-
-    // *** DIAGRAM 209 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 209
-    // (none)
-
-    // Amplitude(s) for diagram number 209
-    FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-
-    // *** DIAGRAM 210 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 210
-    // (none)
-
-    // Amplitude(s) for diagram number 210
-    FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 211 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 211
-    // (none)
-
-    // Amplitude(s) for diagram number 211
-    FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 212 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 212
-    // (none)
-
-    // Amplitude(s) for diagram number 212
-    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 213 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 213
-    // (none)
-
-    // Amplitude(s) for diagram number 213
-    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-
-    // *** DIAGRAM 214 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 214
-    // (none)
-
-    // Amplitude(s) for diagram number 214
-    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 215 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 215
-    // (none)
-
-    // Amplitude(s) for diagram number 215
-    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 216 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 216
-    // (none)
-
-    // Amplitude(s) for diagram number 216
-    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-
-    // *** DIAGRAM 217 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 217
-    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
-
-    // Amplitude(s) for diagram number 217
-    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 218 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 218
-    // (none)
-
-    // Amplitude(s) for diagram number 218
-    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 219 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 219
-    // (none)
-
-    // Amplitude(s) for diagram number 219
-    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 220 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 220
-    // (none)
-
-    // Amplitude(s) for diagram number 220
-    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-
-    // *** DIAGRAM 221 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 221
-    // (none)
-
-    // Amplitude(s) for diagram number 221
-    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 222 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 222
-    // (none)
-
-    // Amplitude(s) for diagram number 222
-    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 223 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 223
-    // (none)
-
-    // Amplitude(s) for diagram number 223
-    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-
-    // *** DIAGRAM 224 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 224
-    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
-    // Amplitude(s) for diagram number 224
-    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 225 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 225
-    // (none)
-
-    // Amplitude(s) for diagram number 225
-    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 226 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 226
-    // (none)
-
-    // Amplitude(s) for diagram number 226
-    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 227 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 227
-    // (none)
-
-    // Amplitude(s) for diagram number 227
-    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-
-    // *** DIAGRAM 228 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 228
-    // (none)
-
-    // Amplitude(s) for diagram number 228
-    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 229 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 229
-    // (none)
-
-    // Amplitude(s) for diagram number 229
-    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 230 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 230
-    // (none)
-
-    // Amplitude(s) for diagram number 230
-    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-
-    // *** DIAGRAM 231 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 231
-    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
-
-    // Amplitude(s) for diagram number 231
-    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 232 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 232
-    // (none)
-
-    // Amplitude(s) for diagram number 232
-    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 233 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 233
-    // (none)
-
-    // Amplitude(s) for diagram number 233
-    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 234 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 234
-    // (none)
-
-    // Amplitude(s) for diagram number 234
-    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-
-    // *** DIAGRAM 235 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 235
-    // (none)
-
-    // Amplitude(s) for diagram number 235
-    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 236 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 236
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
-
-    // Amplitude(s) for diagram number 236
-    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 237 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 237
-    // (none)
-
-    // Amplitude(s) for diagram number 237
-    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-
-    // *** DIAGRAM 238 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 238
-    // (none)
-
-    // Amplitude(s) for diagram number 238
-    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 239 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 239
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
-
-    // Amplitude(s) for diagram number 239
-    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 240 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 240
-    // (none)
-
-    // Amplitude(s) for diagram number 240
-    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[17] -= amp_sv[0];
-
-    // *** DIAGRAM 241 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 241
-    // (none)
-
-    // Amplitude(s) for diagram number 241
-    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 242 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 242
-    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
-    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
-    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
-
-    // Amplitude(s) for diagram number 242
-    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 243 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 243
-    // (none)
-
-    // Amplitude(s) for diagram number 243
-    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-
-    // *** DIAGRAM 244 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 244
-    // (none)
-
-    // Amplitude(s) for diagram number 244
-    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 245 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 245
-    // (none)
-
-    // Amplitude(s) for diagram number 245
-    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-
-    // *** DIAGRAM 246 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 246
-    // (none)
-
-    // Amplitude(s) for diagram number 246
-    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 247 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 247
-    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-    FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
-    FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
-    // Amplitude(s) for diagram number 247
-    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[41] -= amp_sv[0];
-
-    // *** DIAGRAM 248 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 248
-    FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] );
-
-    // Amplitude(s) for diagram number 248
-    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[47] -= amp_sv[0];
-
-    // *** DIAGRAM 249 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 249
-    FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
-    FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] );
-
-    // Amplitude(s) for diagram number 249
-    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[35] -= amp_sv[0];
-
-    // *** DIAGRAM 250 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 250
-    // (none)
-
-    // Amplitude(s) for diagram number 250
-    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[45] -= amp_sv[0];
-
-    // *** DIAGRAM 251 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 251
-    FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
-
-    // Amplitude(s) for diagram number 251
-    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] -= amp_sv[0];
-
-    // *** DIAGRAM 252 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 252
-    // (none)
-
-    // Amplitude(s) for diagram number 252
-    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[39] -= amp_sv[0];
-
-    // *** DIAGRAM 253 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 253
-    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] );
-
-    // Amplitude(s) for diagram number 253
-    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 254 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 254
-    FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 254
-    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 255 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 255
-    // (none)
-
-    // Amplitude(s) for diagram number 255
-    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 256 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 256
-    // (none)
-
-    // Amplitude(s) for diagram number 256
-    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[35] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-
-    // *** DIAGRAM 257 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 257
-    FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-    // Amplitude(s) for diagram number 257
-    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 258 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 258
for diagram number 258 - // (none) - - // Amplitude(s) for diagram number 258 - FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 259 OF 1240 *** - - // Wavefunction(s) for diagram number 259 - // (none) - - // Amplitude(s) for diagram number 259 - VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 260 OF 1240 *** - - // Wavefunction(s) for diagram number 260 - // (none) - - // Amplitude(s) for diagram number 260 - FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 261 OF 1240 *** - - // Wavefunction(s) for diagram number 261 - FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); - - // Amplitude(s) for diagram number 261 - FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 262 OF 1240 *** - - // Wavefunction(s) for diagram number 262 - // (none) - - // Amplitude(s) for diagram number 262 - FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 263 OF 1240 *** - - // Wavefunction(s) for diagram number 263 - FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] ); - - // Amplitude(s) for diagram number 263 - VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 264 OF 1240 *** - - // Wavefunction(s) for diagram number 264 - // (none) - - // Amplitude(s) for diagram number 264 - VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 265 OF 1240 *** - - // Wavefunction(s) for diagram number 265 - // (none) - - // Amplitude(s) for diagram number 265 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 266 OF 1240 *** - - // Wavefunction(s) for diagram number 266 - FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] ); - - // Amplitude(s) for diagram number 266 - FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 267 OF 1240 *** - - // Wavefunction(s) for diagram number 267 - // (none) - - // Amplitude(s) for diagram number 267 - FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 268 OF 1240 *** - - // Wavefunction(s) for diagram number 268 - // (none) - - // Amplitude(s) for diagram number 268 - FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 269 OF 1240 *** - - // Wavefunction(s) for diagram number 269 - // (none) - - // Amplitude(s) for diagram number 269 - FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - - // *** DIAGRAM 270 OF 1240 *** - - // Wavefunction(s) for diagram number 270 - FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 270 - FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 271 OF 1240 *** - - // Wavefunction(s) for diagram number 271 - FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 271 - VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 272 OF 1240 *** - - // Wavefunction(s) for diagram number 272 - // (none) - - // Amplitude(s) for diagram number 272 - FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 273 OF 1240 *** - - // Wavefunction(s) for diagram number 273 - // (none) - - // Amplitude(s) for diagram number 273 - FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 274 OF 1240 *** - - // Wavefunction(s) for diagram number 274 - FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 274 - VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= 
amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 275 OF 1240 *** - - // Wavefunction(s) for diagram number 275 - // (none) - - // Amplitude(s) for diagram number 275 - FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 276 OF 1240 *** - - // Wavefunction(s) for diagram number 276 - // (none) - - // Amplitude(s) for diagram number 276 - FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 277 OF 1240 *** - - // Wavefunction(s) for diagram number 277 - // (none) - - // Amplitude(s) for diagram number 277 - VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 278 OF 1240 *** - - // Wavefunction(s) for diagram number 278 - // (none) - - // Amplitude(s) for diagram number 278 - FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 279 OF 1240 *** - - // Wavefunction(s) for diagram number 279 - // (none) - - // Amplitude(s) for diagram number 279 - VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 280 OF 1240 *** - - // Wavefunction(s) for diagram number 280 - // (none) - - // Amplitude(s) for diagram number 280 - VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] 
-= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 281 OF 1240 *** - - // Wavefunction(s) for diagram number 281 - // (none) - - // Amplitude(s) for diagram number 281 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 282 OF 1240 *** - - // Wavefunction(s) for diagram number 282 - FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 282 - FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 283 OF 1240 *** - - // Wavefunction(s) for diagram number 283 - // (none) - - // Amplitude(s) for diagram number 283 - FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 284 OF 1240 *** - - // Wavefunction(s) for diagram number 284 - // (none) - - // Amplitude(s) for diagram number 284 - FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 285 OF 1240 *** - - // Wavefunction(s) for diagram number 285 - // (none) - - // Amplitude(s) for diagram number 285 - FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - - // *** DIAGRAM 286 OF 1240 *** - - // Wavefunction(s) for diagram number 286 - FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 286 - FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 287 OF 1240 *** - - // Wavefunction(s) for diagram number 287 - FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); - - // Amplitude(s) for diagram number 287 - VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 288 OF 1240 *** - - // Wavefunction(s) for diagram number 288 - // (none) - - // Amplitude(s) for diagram number 288 - FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 289 OF 1240 *** - - // Wavefunction(s) for diagram number 289 - // (none) - - // Amplitude(s) for diagram number 289 - FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 290 OF 1240 *** - - // Wavefunction(s) for diagram number 290 - // (none) - - // Amplitude(s) for diagram number 290 - VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 291 OF 1240 *** - - // Wavefunction(s) for diagram number 291 - // (none) - - // Amplitude(s) for diagram number 291 - FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 292 OF 1240 *** - - // Wavefunction(s) for diagram number 292 - // (none) - - // Amplitude(s) for diagram number 292 - FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - 
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-
-      // *** DIAGRAM 293 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 293
-      // (none)
-
-      // Amplitude(s) for diagram number 293
-      VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 294 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 294
-      // (none)
-
-      // Amplitude(s) for diagram number 294
-      FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-
-      // *** DIAGRAM 295 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 295
-      // (none)
-
-      // Amplitude(s) for diagram number 295
-      VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 296 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 296
-      // (none)
-
-      // Amplitude(s) for diagram number 296
-      VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 297 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 297
-      // (none)
-
-      // Amplitude(s) for diagram number 297
-      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 298 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 298
-      FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
-
-      // Amplitude(s) for diagram number 298
-      FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 299 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 299
-      // (none)
-
-      // Amplitude(s) for diagram number 299
-      FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 300 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 300
-      // (none)
-
-      // Amplitude(s) for diagram number 300
-      FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 301 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 301
-      // (none)
-
-      // Amplitude(s) for diagram number 301
-      FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 302 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 302
-      FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 302
-      FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 303 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 303
-      // (none)
-
-      // Amplitude(s) for diagram number 303
-      VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 304 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 304
-      // (none)
-
-      // Amplitude(s) for diagram number 304
-      FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 305 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 305
-      // (none)
-
-      // Amplitude(s) for diagram number 305
-      FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 306 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 306
-      // (none)
-
-      // Amplitude(s) for diagram number 306
-      VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 307 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 307
-      // (none)
-
-      // Amplitude(s) for diagram number 307
-      FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 308 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 308
-      // (none)
-
-      // Amplitude(s) for diagram number 308
-      FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-
-      // *** DIAGRAM 309 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 309
-      // (none)
-
-      // Amplitude(s) for diagram number 309
-      VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 310 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 310
-      // (none)
-
-      // Amplitude(s) for diagram number 310
-      FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 311 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 311
-      FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 311
-      FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] -= amp_sv[0];
-
-      // *** DIAGRAM 312 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 312
-      // (none)
-
-      // Amplitude(s) for diagram number 312
-      FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 313 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 313
-      FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
-
-      // Amplitude(s) for diagram number 313
-      FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] -= amp_sv[0];
-
-      // *** DIAGRAM 314 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 314
-      // (none)
-
-      // Amplitude(s) for diagram number 314
-      FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 315 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 315
-      // (none)
-
-      // Amplitude(s) for diagram number 315
-      FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] -= amp_sv[0];
-
-      // *** DIAGRAM 316 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 316
-      // (none)
-
-      // Amplitude(s) for diagram number 316
-      FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] -= amp_sv[0];
-
-      // *** DIAGRAM 317 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 317
-      // (none)
-
-      // Amplitude(s) for diagram number 317
-      FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 318 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 318
-      // (none)
-
-      // Amplitude(s) for diagram number 318
-      VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-
-      // *** DIAGRAM 319 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 319
-      // (none)
-
-      // Amplitude(s) for diagram number 319
-      FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 320 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 320
-      // (none)
-
-      // Amplitude(s) for diagram number 320
-      FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[89] -= amp_sv[0];
-
-      // *** DIAGRAM 321 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 321
-      // (none)
-
-      // Amplitude(s) for diagram number 321
-      FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 322 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 322
-      FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
-
-      // Amplitude(s) for diagram number 322
-      FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] -= amp_sv[0];
-
-      // *** DIAGRAM 323 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 323
-      // (none)
-
-      // Amplitude(s) for diagram number 323
-      FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 324 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 324
-      // (none)
-
-      // Amplitude(s) for diagram number 324
-      FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] -= amp_sv[0];
-
-      // *** DIAGRAM 325 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 325
-      // (none)
-
-      // Amplitude(s) for diagram number 325
-      FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 326 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 326
-      // (none)
-
-      // Amplitude(s) for diagram number 326
-      FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 327 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 327
-      // (none)
-
-      // Amplitude(s) for diagram number 327
-      VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-
-      // *** DIAGRAM 328 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 328
-      // (none)
-
-      // Amplitude(s) for diagram number 328
-      FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 329 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 329
-      // (none)
-
-      // Amplitude(s) for diagram number 329
-      FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 330 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 330
-      // (none)
-
-      // Amplitude(s) for diagram number 330
-      FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 331 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 331
-      FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-      // Amplitude(s) for diagram number 331
-      FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 332 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 332
-      // (none)
-
-      // Amplitude(s) for diagram number 332
-      FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 333 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 333
-      // (none)
-
-      // Amplitude(s) for diagram number 333
-      FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 334 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 334
-      // (none)
-
-      // Amplitude(s) for diagram number 334
-      FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 335 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 335
-      // (none)
-
-      // Amplitude(s) for diagram number 335
-      FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 336 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 336
-      // (none)
-
-      // Amplitude(s) for diagram number 336
-      VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 337 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 337
-      // (none)
-
-      // Amplitude(s) for diagram number 337
-      FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 338 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 338
-      // (none)
-
-      // Amplitude(s) for diagram number 338
-      FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 339 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 339
-      // (none)
-
-      // Amplitude(s) for diagram number 339
-      FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 340 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 340
-      // (none)
-
-      // Amplitude(s) for diagram number 340
-      VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 341 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 341
-      // (none)
-
-      // Amplitude(s) for diagram number 341
-      VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 342 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 342
-      // (none)
-
-      // Amplitude(s) for diagram number 342
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 343 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 343
-      // (none)
-
-      // Amplitude(s) for diagram number 343
-      FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 344 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 344
-      // (none)
-
-      // Amplitude(s) for diagram number 344
-      FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 345 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 345
-      // (none)
-
-      // Amplitude(s) for diagram number 345
-      FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 346 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 346
-      // (none)
-
-      // Amplitude(s) for diagram number 346
-      FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 347 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 347
-      // (none)
-
-      // Amplitude(s) for diagram number 347
-      VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 348 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 348
-      // (none)
-
-      // Amplitude(s) for diagram number 348
-      VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 349 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 349
-      // (none)
-
-      // Amplitude(s) for diagram number 349
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 350 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 350
-      // (none)
-
-      // Amplitude(s) for diagram number 350
-      FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 351 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 351
-      // (none)
-
-      // Amplitude(s) for diagram number 351
-      FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 352 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 352
-      // (none)
-
-      // Amplitude(s) for diagram number 352
-      FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 353 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 353
-      // (none)
-
-      // Amplitude(s) for diagram number 353
-      FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 354 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 354
-      // (none)
-
-      // Amplitude(s) for diagram number 354
-      VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 355 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 355
-      // (none)
-
-      // Amplitude(s) for diagram number 355
-      VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 356 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 356
-      // (none)
-
-      // Amplitude(s) for diagram number 356
-      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 357 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 357
-      // (none)
-
-      // Amplitude(s) for diagram number 357
-      FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 358 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 358
-      // (none)
-
-      // Amplitude(s) for diagram number 358
-      FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 359 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 359
-      // (none)
-
-      // Amplitude(s) for diagram number 359
-      VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 360 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 360
-      // (none)
-
-      // Amplitude(s) for diagram number 360
-      FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 361 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 361
-      // (none)
-
-      // Amplitude(s) for diagram number 361
-      FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 362 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 362
-      // (none)
-
-      // Amplitude(s) for diagram number 362
-      VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1
) * amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 363 OF 1240 *** - - // Wavefunction(s) for diagram number 363 - // (none) - - // Amplitude(s) for diagram number 363 - FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[45] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 364 OF 1240 *** - - // Wavefunction(s) for diagram number 364 - // (none) - - // Amplitude(s) for diagram number 364 - FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - - // *** DIAGRAM 365 OF 1240 *** - - // Wavefunction(s) for diagram number 365 - // (none) - - // 
Amplitude(s) for diagram number 365 - VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 366 OF 1240 *** - - // Wavefunction(s) for diagram number 366 - // (none) - - // Amplitude(s) for diagram number 366 - FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[47] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 367 OF 1240 *** - - // Wavefunction(s) for diagram number 367 - // (none) - - // Amplitude(s) for diagram number 367 - FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - - // *** DIAGRAM 368 OF 1240 *** - - // Wavefunction(s) for diagram number 368 - // (none) - - // Amplitude(s) for diagram number 368 - FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 369 OF 1240 *** - - // Wavefunction(s) for diagram number 369 - // (none) - - // Amplitude(s) for diagram number 369 - VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 370 OF 1240 *** - 
- // Wavefunction(s) for diagram number 370 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 370 - FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 371 OF 1240 *** - - // Wavefunction(s) for diagram number 371 - // (none) - - // Amplitude(s) for diagram number 371 - FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 372 OF 1240 *** - - // Wavefunction(s) for diagram number 372 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] ); - FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] ); - - // Amplitude(s) for diagram number 372 - VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 373 OF 1240 *** - - // Wavefunction(s) for diagram number 373 - // (none) - - // Amplitude(s) for diagram number 373 - FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 374 OF 1240 *** - - // Wavefunction(s) for diagram number 374 - VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 374 - VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 375 OF 1240 *** - - // Wavefunction(s) for diagram number 375 - // (none) - - // Amplitude(s) for diagram number 375 - FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - - // *** DIAGRAM 376 OF 1240 
*** - - // Wavefunction(s) for diagram number 376 - VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); - VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 376 - FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 377 OF 1240 *** - - // Wavefunction(s) for diagram number 377 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); - - // Amplitude(s) for diagram number 377 - FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 378 OF 1240 *** - - // Wavefunction(s) for diagram number 378 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 378 - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 379 OF 1240 *** - - // Wavefunction(s) for diagram number 379 - // (none) - - // Amplitude(s) for diagram number 379 - FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - - // 
*** DIAGRAM 380 OF 1240 *** - - // Wavefunction(s) for diagram number 380 - // (none) - - // Amplitude(s) for diagram number 380 - FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 381 OF 1240 *** - - // Wavefunction(s) for diagram number 381 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); - - // Amplitude(s) for diagram number 381 - FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 382 OF 1240 *** - - // Wavefunction(s) for diagram number 382 - // (none) - - // Amplitude(s) for diagram number 382 - FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - - // *** DIAGRAM 383 OF 1240 *** - - // Wavefunction(s) for diagram number 383 - // (none) - - // Amplitude(s) for diagram number 383 - FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - - // *** DIAGRAM 384 OF 1240 *** - - // Wavefunction(s) for diagram number 384 - // (none) - - // Amplitude(s) for diagram number 384 - FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 385 OF 1240 *** - - // Wavefunction(s) for diagram number 385 - VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 385 - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 386 OF 1240 *** - - // Wavefunction(s) for diagram number 386 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); - - // Amplitude(s) for diagram number 386 - FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 387 OF 1240 *** - - // Wavefunction(s) for diagram number 387 - // (none) - - // Amplitude(s) for diagram number 387 - FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 388 OF 1240 *** - - // Wavefunction(s) for diagram number 388 - FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] ); - - // Amplitude(s) for diagram number 388 - VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 389 OF 1240 *** - - // Wavefunction(s) for diagram number 389 - // (none) - - // Amplitude(s) for diagram number 389 - FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 390 OF 1240 *** - - // Wavefunction(s) for diagram number 390 - // (none) - - // Amplitude(s) for diagram number 390 - VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 391 OF 1240 *** - - // Wavefunction(s) for diagram number 391 - // (none) - - // Amplitude(s) for diagram number 391 - FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 392 OF 1240 *** - - // Wavefunction(s) for diagram number 392 - // (none) - - // Amplitude(s) for diagram number 392 - FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 393 OF 1240 *** - - // Wavefunction(s) for diagram number 393 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); - - // Amplitude(s) for diagram number 393 - FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 394 OF 1240 *** - - // Wavefunction(s) for diagram number 394 - FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] ); - - // Amplitude(s) for diagram number 394 - FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 395 OF 1240 *** - - // Wavefunction(s) for diagram number 395 - // (none) - - // Amplitude(s) for diagram number 395 - FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[75] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - - // *** DIAGRAM 396 OF 1240 *** - - // Wavefunction(s) for diagram number 396 - // (none) - - // Amplitude(s) for diagram number 396 - FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 397 OF 1240 *** - - // Wavefunction(s) for diagram number 397 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 397 - FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here 
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 398 OF 1240 *** - - // Wavefunction(s) for diagram number 398 - // (none) - - // Amplitude(s) for diagram number 398 - FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[99] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 399 OF 1240 *** - - // Wavefunction(s) for diagram number 399 - // (none) - - // Amplitude(s) for diagram number 399 - FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 400 OF 1240 *** - - // Wavefunction(s) for diagram number 400 - // (none) - - // Amplitude(s) for diagram number 400 - FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 401 OF 1240 *** - - // Wavefunction(s) for diagram number 401 - // (none) - - // Amplitude(s) for diagram number 401 - FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 402 OF 1240 *** - - // Wavefunction(s) for diagram number 402 - // (none) - - // Amplitude(s) for diagram number 402 - FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 403 OF 1240 *** - - // Wavefunction(s) for diagram number 403 - // (none) - - // Amplitude(s) for diagram number 403 - FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 404 OF 1240 *** - - // 
Wavefunction(s) for diagram number 404 - // (none) - - // Amplitude(s) for diagram number 404 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 405 OF 1240 *** - - // Wavefunction(s) for diagram number 405 - // (none) - - // Amplitude(s) for diagram number 405 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 406 OF 1240 *** - - // Wavefunction(s) for diagram number 406 - // (none) - - // Amplitude(s) for diagram number 406 - FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 407 OF 1240 *** - - // Wavefunction(s) for diagram number 407 - // (none) - - // Amplitude(s) for diagram number 407 - FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 408 OF 1240 *** - - // Wavefunction(s) for diagram number 408 - // (none) - - // Amplitude(s) for diagram number 408 - VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 409 OF 1240 *** - - // Wavefunction(s) for diagram number 409 - VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 409 - VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 410 OF 1240 *** - - // Wavefunction(s) for diagram number 410 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 410 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 411 OF 1240 *** - - // Wavefunction(s) for diagram number 411 - // (none) - - // Amplitude(s) for diagram number 411 - VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - 
jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 412 OF 1240 *** - - // Wavefunction(s) for diagram number 412 - // (none) - - // Amplitude(s) for diagram number 412 - FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 413 OF 1240 *** - - // Wavefunction(s) for diagram number 413 - // (none) - - // Amplitude(s) for diagram number 413 - FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 414 OF 1240 *** - - // Wavefunction(s) for diagram number 414 - // (none) - - // Amplitude(s) for diagram number 414 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 415 OF 1240 *** - - // Wavefunction(s) for diagram number 415 - // (none) - - // Amplitude(s) for diagram number 415 - FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 416 OF 1240 *** - - // Wavefunction(s) for diagram number 416 - // (none) - - // Amplitude(s) for diagram number 416 - FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - - // *** DIAGRAM 417 OF 1240 *** - - // Wavefunction(s) for diagram number 417 - // (none) - - // Amplitude(s) for diagram number 417 - FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - - // *** DIAGRAM 418 OF 1240 *** - - 
// Wavefunction(s) for diagram number 418 - // (none) - - // Amplitude(s) for diagram number 418 - FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - - // *** DIAGRAM 419 OF 1240 *** - - // Wavefunction(s) for diagram number 419 - // (none) - - // Amplitude(s) for diagram number 419 - FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 420 OF 1240 *** - - // Wavefunction(s) for diagram number 420 - // (none) - - // Amplitude(s) for diagram number 420 - FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 421 OF 1240 *** - - // Wavefunction(s) for diagram number 421 - // (none) - - // Amplitude(s) for diagram number 421 - FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 422 OF 1240 *** - - // Wavefunction(s) for diagram number 422 - // (none) - - // Amplitude(s) for diagram number 422 - FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 423 OF 1240 *** - - // Wavefunction(s) for diagram number 423 - // (none) - - // Amplitude(s) for diagram number 423 - FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] 
-= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 424 OF 1240 *** - - // Wavefunction(s) for diagram number 424 - // (none) - - // Amplitude(s) for diagram number 424 - VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 425 OF 1240 *** - - // Wavefunction(s) for diagram number 425 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 425 - VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - - // *** DIAGRAM 426 OF 1240 *** - - // Wavefunction(s) for diagram number 426 - // (none) - - // Amplitude(s) for diagram number 426 - VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 427 OF 1240 *** - - // Wavefunction(s) for diagram number 427 - // (none) - - // Amplitude(s) for diagram number 427 - VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 428 OF 1240 *** - - // Wavefunction(s) for diagram number 428 - // (none) - - // Amplitude(s) for diagram number 428 - FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 429 OF 1240 *** - - // Wavefunction(s) for diagram number 429 - // (none) - - // Amplitude(s) for diagram number 429 - FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 430 OF 1240 *** - - // Wavefunction(s) for diagram number 430 - // (none) - - // Amplitude(s) for diagram number 430 - FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - - // *** DIAGRAM 431 OF 1240 *** - - // Wavefunction(s) for diagram number 431 - // (none) - - // Amplitude(s) for diagram number 431 - FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 432 OF 1240 *** - - // Wavefunction(s) for diagram number 432 - // (none) - - // Amplitude(s) for diagram number 432 - FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - - // *** DIAGRAM 433 OF 1240 *** - - // Wavefunction(s) for diagram number 433 - // (none) - - // Amplitude(s) for diagram number 433 - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - - // *** DIAGRAM 434 OF 1240 *** - - // Wavefunction(s) for diagram number 434 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 434 - VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 435 OF 1240 *** - - // Wavefunction(s) for diagram number 435 - // (none) - - // Amplitude(s) for diagram number 435 - VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 436 OF 1240 *** - - // Wavefunction(s) for diagram number 436 - // (none) - - // Amplitude(s) for diagram number 436 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - 
-
- // *** DIAGRAM 436 OF 1240 ***
-
- // Wavefunction(s) for diagram number 436
- // (none)
-
- // Amplitude(s) for diagram number 436
- VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 437 OF 1240 ***
-
- // Wavefunction(s) for diagram number 437
- VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
-
- // Amplitude(s) for diagram number 437
- VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 438 OF 1240 ***
-
- // Wavefunction(s) for diagram number 438
- // (none)
-
- // Amplitude(s) for diagram number 438
- VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 439 OF 1240 ***
-
- // Wavefunction(s) for diagram number 439
- // (none)
-
- // Amplitude(s) for diagram number 439
- VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 440 OF 1240 ***
-
- // Wavefunction(s) for diagram number 440
- // (none)
-
- // Amplitude(s) for diagram number 440
- VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 441 OF 1240 ***
-
- // Wavefunction(s) for diagram number 441
- // (none)
-
- // Amplitude(s) for diagram number 441
- VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
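Diagrams 436 and 439 above (and 442 and 446 below) evaluate each four-gluon vertex as three separate calls, VVVV1_0, VVVV3_0 and VVVV4_0: the gggg vertex carries three independent Lorentz structures, and each structure feeds the color-flow accumulators jamp_sv with its own pattern of +1/-1 coefficients, while quark-gluon vertices also contribute +/-i factors (the cxtype( 0, 1 ) terms). A self-contained illustration of this folding in plain C++, with std::complex standing in for the plugin's cxtype_sv and dummy amplitude values:

#include <complex>
#include <vector>

int main()
{
  using cx = std::complex<double>;
  std::vector<cx> jamp( 120 ); // one accumulator per color flow (120 here, matching jamp_sv[0..119])
  const cx I( 0., 1. );
  cx amp( 0.3, -0.1 );         // first Lorentz structure of a vertex (dummy value)
  jamp[7] -= amp;              // color coefficient -1, as in the VVVV1_0 update above
  jamp[25] += amp;             // color coefficient +1
  amp = cx( -0.2, 0.4 );       // next structure of the same vertex (dummy value)
  jamp[6] -= amp;              // a different color-flow pattern, as in VVVV3_0
  jamp[74] += I * amp;         // a +i coefficient, as in the FFV1_0 updates elsewhere
  return 0;
}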
-
- // *** DIAGRAM 442 OF 1240 ***
-
- // Wavefunction(s) for diagram number 442
- // (none)
-
- // Amplitude(s) for diagram number 442
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 443 OF 1240 ***
-
- // Wavefunction(s) for diagram number 443
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 443
- VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 444 OF 1240 ***
-
- // Wavefunction(s) for diagram number 444
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
-
- // Amplitude(s) for diagram number 444
- VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
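Diagrams 443 and 444 above factorize the four-gluon vertex the other way round: the VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 calls compute three off-shell internal gluons once (into w_fp[109..111] and w_fp[112..114]), which later diagrams in this hunk (456, 466, 476, 483) reuse inside plain three-point amplitude calls instead of re-evaluating the vertex. The calling convention, as a sketch inferred from the calls in this file rather than a verbatim copy of the generated HelAmps header: routines ending in _0 return an amplitude through their last argument, routines ending in P0_1/_1/_2 return an off-shell wavefunction, and the two scalar literals before the output are the propagator mass and width (0., 0. for gluons; cIPD[0], cIPD[1] for the massive top quark):

void VVV1_0( const fptype V1[], const fptype V2[], const fptype V3[],
             const fptype COUP[], double Ccoeff, fptype amp[] ); // fills amp_sv[0]
void VVVV1P0_1( const fptype V2[], const fptype V3[], const fptype V4[],
                const fptype COUP[], double Ccoeff,
                fptype M1, fptype W1, fptype V1[] );             // fills an off-shell gluon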
-
- // *** DIAGRAM 445 OF 1240 ***
-
- // Wavefunction(s) for diagram number 445
- // (none)
-
- // Amplitude(s) for diagram number 445
- VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 446 OF 1240 ***
-
- // Wavefunction(s) for diagram number 446
- // (none)
-
- // Amplitude(s) for diagram number 446
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 447 OF 1240 ***
-
- // Wavefunction(s) for diagram number 447
- // (none)
-
- // Amplitude(s) for diagram number 447
- VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 448 OF 1240 ***
-
- // Wavefunction(s) for diagram number 448
- // (none)
-
- // Amplitude(s) for diagram number 448
- VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 449 OF 1240 ***
-
- // Wavefunction(s) for diagram number 449
- // (none)
-
- // Amplitude(s) for diagram number 449
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 450 OF 1240 ***
-
- // Wavefunction(s) for diagram number 450
- // (none)
-
- // Amplitude(s) for diagram number 450
- VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 451 OF 1240 ***
-
- // Wavefunction(s) for diagram number 451
- // (none)
-
- // Amplitude(s) for diagram number 451
- FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 452 OF 1240 ***
-
- // Wavefunction(s) for diagram number 452
- // (none)
-
- // Amplitude(s) for diagram number 452
- FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 453 OF 1240 ***
-
- // Wavefunction(s) for diagram number 453
- // (none)
-
- // Amplitude(s) for diagram number 453
- FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 454 OF 1240 ***
-
- // Wavefunction(s) for diagram number 454
- // (none)
-
- // Amplitude(s) for diagram number 454
- FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
-
- // *** DIAGRAM 455 OF 1240 ***
-
- // Wavefunction(s) for diagram number 455
- // (none)
-
- // Amplitude(s) for diagram number 455
- VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 456 OF 1240 ***
-
- // Wavefunction(s) for diagram number 456
- // (none)
-
- // Amplitude(s) for diagram number 456
- FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 457 OF 1240 ***
-
- // Wavefunction(s) for diagram number 457
- // (none)
-
- // Amplitude(s) for diagram number 457
- FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 458 OF 1240 ***
-
- // Wavefunction(s) for diagram number 458
- // (none)
-
- // Amplitude(s) for diagram number 458
- FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 459 OF 1240 ***
-
- // Wavefunction(s) for diagram number 459
- // (none)
-
- // Amplitude(s) for diagram number 459
- FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 460 OF 1240 ***
-
- // Wavefunction(s) for diagram number 460
- // (none)
-
- // Amplitude(s) for diagram number 460
- VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 461 OF 1240 ***
-
- // Wavefunction(s) for diagram number 461
- // (none)
-
- // Amplitude(s) for diagram number 461
- FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 462 OF 1240 ***
-
- // Wavefunction(s) for diagram number 462
- // (none)
-
- // Amplitude(s) for diagram number 462
- FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 463 OF 1240 ***
-
- // Wavefunction(s) for diagram number 463
- // (none)
-
- // Amplitude(s) for diagram number 463
- FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 464 OF 1240 ***
-
- // Wavefunction(s) for diagram number 464
- // (none)
-
- // Amplitude(s) for diagram number 464
- FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 465 OF 1240 ***
-
- // Wavefunction(s) for diagram number 465
- // (none)
-
- // Amplitude(s) for diagram number 465
- VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 466 OF 1240 ***
-
- // Wavefunction(s) for diagram number 466
- // (none)
-
- // Amplitude(s) for diagram number 466
- FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 467 OF 1240 ***
-
- // Wavefunction(s) for diagram number 467
- // (none)
-
- // Amplitude(s) for diagram number 467
- FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 468 OF 1240 ***
-
- // Wavefunction(s) for diagram number 468
- // (none)
-
- // Amplitude(s) for diagram number 468
- FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 469 OF 1240 ***
-
- // Wavefunction(s) for diagram number 469
- // (none)
-
- // Amplitude(s) for diagram number 469
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 470 OF 1240 ***
-
- // Wavefunction(s) for diagram number 470
- // (none)
-
- // Amplitude(s) for diagram number 470
- VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 471 OF 1240 ***
-
- // Wavefunction(s) for diagram number 471
- // (none)
-
- // Amplitude(s) for diagram number 471
- FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
-
- // *** DIAGRAM 472 OF 1240 ***
-
- // Wavefunction(s) for diagram number 472
- // (none)
-
- // Amplitude(s) for diagram number 472
- FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
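All of the amp_sv/jamp_sv arithmetic in this function is written once against "scalar-or-vector" (_sv) types: on the GPU each thread processes one event and cxtype_sv is a plain complex scalar, while in the vectorized C++ build it is a short SIMD vector of events. A rough sketch of that mechanism, simplified from mgOnGpuVectors.h (the macro and typedef names below are assumptions in this simplified form):

#ifdef MGONGPU_CPPSIMD
typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // gcc/clang vector extension
typedef cxtype_v cxtype_sv; // C++ SIMD: one vector "page" of events per call
#else
typedef cxtype cxtype_sv;   // CUDA (or scalar C++): one event per call
#endif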
-
- // *** DIAGRAM 473 OF 1240 ***
-
- // Wavefunction(s) for diagram number 473
- // (none)
-
- // Amplitude(s) for diagram number 473
- FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 474 OF 1240 ***
-
- // Wavefunction(s) for diagram number 474
- // (none)
-
- // Amplitude(s) for diagram number 474
- FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 475 OF 1240 ***
-
- // Wavefunction(s) for diagram number 475
- // (none)
-
- // Amplitude(s) for diagram number 475
- VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 476 OF 1240 ***
-
- // Wavefunction(s) for diagram number 476
- // (none)
-
- // Amplitude(s) for diagram number 476
- FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 477 OF 1240 ***
-
- // Wavefunction(s) for diagram number 477
- // (none)
-
- // Amplitude(s) for diagram number 477
- VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 478 OF 1240 ***
-
- // Wavefunction(s) for diagram number 478
- // (none)
-
- // Amplitude(s) for diagram number 478
- FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
-
- // *** DIAGRAM 479 OF 1240 ***
-
- // Wavefunction(s) for diagram number 479
- // (none)
-
- // Amplitude(s) for diagram number 479
- FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 480 OF 1240 ***
-
- // Wavefunction(s) for diagram number 480
- // (none)
-
- // Amplitude(s) for diagram number 480
- FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 481 OF 1240 ***
-
- // Wavefunction(s) for diagram number 481
- // (none)
-
- // Amplitude(s) for diagram number 481
- FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
-
- // *** DIAGRAM 482 OF 1240 ***
-
- // Wavefunction(s) for diagram number 482
- // (none)
-
- // Amplitude(s) for diagram number 482
- VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 483 OF 1240 ***
-
- // Wavefunction(s) for diagram number 483
- // (none)
-
- // Amplitude(s) for diagram number 483
- FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 484 OF 1240 ***
-
- // Wavefunction(s) for diagram number 484
- // (none)
-
- // Amplitude(s) for diagram number 484
- FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 485 OF 1240 ***
-
- // Wavefunction(s) for diagram number 485
- // (none)
-
- // Amplitude(s) for diagram number 485
- FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 486 OF 1240 ***
-
- // Wavefunction(s) for diagram number 486
- // (none)
-
- // Amplitude(s) for diagram number 486
- FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 487 OF 1240 ***
-
- // Wavefunction(s) for diagram number 487
- // (none)
-
- // Amplitude(s) for diagram number 487
- FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
-
- // *** DIAGRAM 488 OF 1240 ***
-
- // Wavefunction(s) for diagram number 488
- // (none)
-
- // Amplitude(s) for diagram number 488
- FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 489 OF 1240 ***
-
- // Wavefunction(s) for diagram number 489
- // (none)
-
- // Amplitude(s) for diagram number 489
- FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 490 OF 1240 ***
-
- // Wavefunction(s) for diagram number 490
- // (none)
-
- // Amplitude(s) for diagram number 490
- FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
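For orientation on where these accumulators end up: after the last of the 1240 diagrams, the generated code contracts the color-flow amplitudes with the process color matrix to obtain |M|^2. A schematic of that final step, assuming ncolor = 120 (implied by the jamp indices up to 118 in this hunk) and the cf/denom color-matrix tables and cx helpers of the generated code:

fptype_sv deltaMEs = { 0 }; // running |M|^2 contribution for this helicity
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype_sv ztemp_sv = cxzero_sv();
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp_sv += cf[icol][jcol] * jamp_sv[jcol]; // color-matrix row times color flows
  deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol];
}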
-
- // *** DIAGRAM 491 OF 1240 ***
-
- // Wavefunction(s) for diagram number 491
- // (none)
-
- // Amplitude(s) for diagram number 491
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 492 OF 1240 ***
-
- // Wavefunction(s) for diagram number 492
- // (none)
-
- // Amplitude(s) for diagram number 492
- VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 493 OF 1240 ***
-
- // Wavefunction(s) for diagram number 493
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
- FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
- // Amplitude(s) for diagram number 493
- FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 494 OF 1240 ***
-
- // Wavefunction(s) for diagram number 494
- // (none)
-
- // Amplitude(s) for diagram number 494
- FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 495 OF 1240 ***
-
- // Wavefunction(s) for diagram number 495
- VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
- // Amplitude(s) for diagram number 495
- VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 496 OF 1240 ***
-
- // Wavefunction(s) for diagram number 496
- // (none)
-
- // Amplitude(s) for diagram number 496
- FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
-
- // *** DIAGRAM 497 OF 1240 ***
-
- // Wavefunction(s) for diagram number 497
- VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 497
- VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 498 OF 1240 ***
-
- // Wavefunction(s) for diagram number 498
- // (none)
-
- // Amplitude(s) for diagram number 498
- FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
-
- // *** DIAGRAM 499 OF 1240 ***
-
- // Wavefunction(s) for diagram number 499
- VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
- VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
- // Amplitude(s) for diagram number 499
- FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 500 OF 1240 ***
-
- // Wavefunction(s) for diagram number 500
- FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
- // Amplitude(s) for diagram number 500
- FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 501 OF 1240 ***
-
- // Wavefunction(s) for diagram number 501
- FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
- // Amplitude(s) for diagram number 501
- FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 502 OF 1240 ***
-
- // Wavefunction(s) for diagram number 502
- // (none)
-
- // Amplitude(s) for diagram number 502
- FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
-
- // *** DIAGRAM 503 OF 1240 ***
-
- // Wavefunction(s) for diagram number 503
- // (none)
-
- // Amplitude(s) for diagram number 503
- FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 504 OF 1240 ***
-
- // Wavefunction(s) for diagram number 504
- FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
- // Amplitude(s) for diagram number 504
- FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 505 OF 1240 ***
-
- // Wavefunction(s) for diagram number 505
- // (none)
-
- // Amplitude(s) for diagram number 505
- FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
-
- // *** DIAGRAM 506 OF 1240 ***
-
- // Wavefunction(s) for diagram number 506
- // (none)
-
- // Amplitude(s) for diagram number 506
- FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
-
- // *** DIAGRAM 507 OF 1240 ***
-
- // Wavefunction(s) for diagram number 507
- // (none)
-
- // Amplitude(s) for diagram number 507
- FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
-
- // *** DIAGRAM 508 OF 1240 ***
-
- // Wavefunction(s) for diagram number 508
- VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
-
- // Amplitude(s) for diagram number 508
- FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 509 OF 1240 ***
-
- // Wavefunction(s) for diagram number 509
- FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
-
- // Amplitude(s) for diagram number
509 - FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 510 OF 1240 *** - - // Wavefunction(s) for diagram number 510 - // (none) - - // Amplitude(s) for diagram number 510 - FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 511 OF 1240 *** - - // Wavefunction(s) for diagram number 511 - // (none) - - // Amplitude(s) for diagram number 511 - VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 512 OF 1240 *** - - // Wavefunction(s) for diagram number 512 - // (none) - - // Amplitude(s) for diagram number 512 - FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - - // *** DIAGRAM 513 OF 1240 *** - - // Wavefunction(s) for diagram number 513 - // (none) - - // Amplitude(s) for diagram number 513 - VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 514 OF 1240 *** - - // Wavefunction(s) for diagram number 514 - // (none) - - // Amplitude(s) for diagram number 514 - FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 515 OF 1240 *** - - // Wavefunction(s) for diagram number 515 - // (none) - - // Amplitude(s) for diagram number 515 - FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 516 OF 1240 *** - - // Wavefunction(s) for diagram number 516 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); - - // Amplitude(s) for diagram number 516 - FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 517 OF 1240 *** - - // Wavefunction(s) for diagram number 517 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 517 - FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 518 OF 1240 *** - - // Wavefunction(s) for diagram number 518 - // (none) - - // Amplitude(s) for diagram number 518 - FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - - // *** DIAGRAM 519 OF 1240 *** - - // Wavefunction(s) for diagram number 519 - // (none) - - // Amplitude(s) for diagram number 519 - FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 520 OF 1240 *** - - // Wavefunction(s) for diagram number 520 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], 
cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 520 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 521 OF 1240 *** - - // Wavefunction(s) for diagram number 521 - // (none) - - // Amplitude(s) for diagram number 521 - FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[101] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 522 OF 1240 *** - - // Wavefunction(s) for diagram number 522 - // (none) - - // Amplitude(s) for diagram number 522 - FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 523 OF 1240 *** - - // Wavefunction(s) for diagram number 523 - // (none) - - // Amplitude(s) for diagram number 523 - FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 524 OF 1240 *** - - // Wavefunction(s) for diagram number 524 - // (none) - - // Amplitude(s) for diagram number 524 - FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 525 OF 1240 *** - - // Wavefunction(s) for diagram number 525 - // (none) - - // Amplitude(s) for diagram number 525 - FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 526 OF 1240 *** - - // Wavefunction(s) for diagram number 526 - // (none) - - // Amplitude(s) for diagram number 526 - FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 527 OF 1240 *** - - // Wavefunction(s) for diagram number 527 - // (none) - - // Amplitude(s) for diagram number 527 - FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 528 OF 1240 *** - - // Wavefunction(s) for diagram number 528 - // (none) - - // Amplitude(s) for diagram number 528 - FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 529 OF 1240 *** - - // Wavefunction(s) for diagram number 529 - // (none) - - // Amplitude(s) for diagram number 529 - FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 530 OF 1240 *** - - // Wavefunction(s) for diagram number 530 - // (none) - - // Amplitude(s) for diagram number 530 - FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 531 OF 1240 *** - - // Wavefunction(s) for diagram number 531 - // (none) - - // Amplitude(s) for diagram number 531 - VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += 
amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 532 OF 1240 *** - - // Wavefunction(s) for diagram number 532 - VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 532 - VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 533 OF 1240 *** - - // Wavefunction(s) for diagram number 533 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); - - // Amplitude(s) for diagram number 533 - VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 534 OF 1240 *** - - // Wavefunction(s) for diagram number 534 - // (none) - - // Amplitude(s) for diagram number 534 - VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 535 OF 1240 *** - - // Wavefunction(s) for diagram number 535 - // (none) - - // Amplitude(s) for diagram number 535 - FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 536 OF 1240 *** - - // Wavefunction(s) for diagram number 536 - // (none) - - // Amplitude(s) for diagram number 536 - FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 537 OF 1240 *** - - // Wavefunction(s) for diagram number 537 - // (none) - - // Amplitude(s) for diagram number 537 - FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 538 OF 1240 *** - - // Wavefunction(s) for diagram number 538 - // (none) - - // Amplitude(s) for diagram number 538 - FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 539 OF 1240 *** - - // Wavefunction(s) for diagram number 539 - // (none) - - // Amplitude(s) for diagram number 539 - FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - - // *** DIAGRAM 540 OF 1240 *** - - // Wavefunction(s) for diagram number 540 - // (none) - - // Amplitude(s) for diagram number 540 - FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - - // *** DIAGRAM 541 OF 1240 *** - - // Wavefunction(s) for diagram number 541 - // (none) - - // Amplitude(s) for diagram number 541 - FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - - // *** DIAGRAM 542 OF 1240 *** - - // Wavefunction(s) for diagram number 542 - // (none) - - // Amplitude(s) for diagram number 542 - FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 543 OF 1240 *** - - // Wavefunction(s) for diagram number 543 - // (none) - - // Amplitude(s) for diagram number 543 - FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - - // *** DIAGRAM 544 OF 1240 *** - - // Wavefunction(s) for diagram number 544 - // (none) - - // Amplitude(s) for diagram number 544 - FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 545 OF 1240 *** - - // Wavefunction(s) for diagram number 545 - // (none) - - // Amplitude(s) for diagram number 545 - FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 546 OF 1240 *** - - // Wavefunction(s) for diagram number 546 - // (none) - - // Amplitude(s) for diagram number 546 - FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 547 OF 1240 *** - - // Wavefunction(s) for diagram number 547 - // (none) - - // Amplitude(s) for diagram number 547 - VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 548 OF 1240 *** - - // Wavefunction(s) for diagram number 548 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 548 - VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 549 OF 1240 *** - - // Wavefunction(s) for diagram number 
549 - // (none) - - // Amplitude(s) for diagram number 549 - VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - - // *** DIAGRAM 550 OF 1240 *** - - // Wavefunction(s) for diagram number 550 - // (none) - - // Amplitude(s) for diagram number 550 - VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 551 OF 1240 *** - - // Wavefunction(s) for diagram number 551 - // (none) - - // Amplitude(s) for diagram number 551 - FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 552 OF 1240 *** - - // Wavefunction(s) for diagram number 552 - // (none) - - // Amplitude(s) for diagram number 552 - FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - - // *** DIAGRAM 553 OF 1240 *** - - // Wavefunction(s) for diagram number 553 - // (none) - - // Amplitude(s) for diagram number 553 - FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - - // *** DIAGRAM 554 OF 1240 *** - - // Wavefunction(s) for diagram number 554 - // (none) - - // Amplitude(s) for diagram number 554 - FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 555 OF 1240 *** - - // Wavefunction(s) for diagram number 555 - // (none) - - // Amplitude(s) for diagram number 555 - FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - - // *** DIAGRAM 556 OF 1240 *** - - // Wavefunction(s) for diagram number 556 - // (none) - - // Amplitude(s) for diagram number 556 - FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 557 OF 1240 *** - - // Wavefunction(s) for diagram number 557 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 557 - VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 558 OF 1240 *** - - // Wavefunction(s) for diagram number 558 - // (none) - - // Amplitude(s) for diagram number 558 - VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 559 OF 1240 *** - - // Wavefunction(s) for diagram number 559 - // (none) - - // Amplitude(s) for diagram number 559 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= 
amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 560 OF 1240 *** - - // Wavefunction(s) for diagram number 560 - // (none) - - // Amplitude(s) for diagram number 560 - VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 561 OF 1240 *** - - // Wavefunction(s) for diagram number 561 - // (none) - - // Amplitude(s) for diagram number 561 - VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 562 OF 1240 *** - - // Wavefunction(s) for diagram number 562 - // (none) - - // Amplitude(s) for diagram number 562 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 563 OF 1240 *** - - // Wavefunction(s) for diagram number 563 - // (none) - - // Amplitude(s) for diagram number 563 - VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 564 OF 1240 *** - - // Wavefunction(s) for diagram number 564 - // (none) - - // Amplitude(s) for diagram number 564 - VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 565 OF 1240 *** - - // Wavefunction(s) for diagram 
number 565 - // (none) - - // Amplitude(s) for diagram number 565 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 566 OF 1240 *** - - // Wavefunction(s) for diagram number 566 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 566 - VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - 
jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 567 OF 1240 *** - - // Wavefunction(s) for diagram number 567 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); - - // Amplitude(s) for diagram number 567 - VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 568 OF 1240 *** - - // Wavefunction(s) for diagram number 568 - // (none) - - // Amplitude(s) for diagram number 568 - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 569 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 569
-      // (none)
-
-      // Amplitude(s) for diagram number 569
-      VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 570 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 570
-      // (none)
-
-      // Amplitude(s) for diagram number 570
-      VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-
-      // *** DIAGRAM 571 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 571
-      // (none)
-
-      // Amplitude(s) for diagram number 571
-      VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 572 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 572
-      // (none)
-
-      // Amplitude(s) for diagram number 572
-      VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 573 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 573
-      // (none)
-
-      // Amplitude(s) for diagram number 573
-      VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 574 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 574
-      // (none)
-
-      // Amplitude(s) for diagram number 574
-      FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-
-      // *** DIAGRAM 575 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 575
-      // (none)
-
-      // Amplitude(s) for diagram number 575
-      FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 576 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 576
-      // (none)
-
-      // Amplitude(s) for diagram number 576
-      FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 577 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 577
-      // (none)
-
-      // Amplitude(s) for diagram number 577
-      FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-
-      // *** DIAGRAM 578 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 578
-      // (none)
-
-      // Amplitude(s) for diagram number 578
-      VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 579 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 579
-      // (none)
-
-      // Amplitude(s) for diagram number 579
-      FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 580 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 580
-      // (none)
-
-      // Amplitude(s) for diagram number 580
-      FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-
-      // *** DIAGRAM 581 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 581
-      // (none)
-
-      // Amplitude(s) for diagram number 581
-      FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 582 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 582
-      // (none)
-
-      // Amplitude(s) for diagram number 582
-      FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 583 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 583
-      // (none)
-
-      // Amplitude(s) for diagram number 583
-      VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 584 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 584
-      // (none)
-
-      // Amplitude(s) for diagram number 584
-      FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-
-      // *** DIAGRAM 585 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 585
-      // (none)
-
-      // Amplitude(s) for diagram number 585
-      FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 586 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 586
-      // (none)
-
-      // Amplitude(s) for diagram number 586
-      FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 587 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 587
-      // (none)
-
-      // Amplitude(s) for diagram number 587
-      FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 588 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 588
-      // (none)
-
-      // Amplitude(s) for diagram number 588
-      VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 589 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 589
-      // (none)
-
-      // Amplitude(s) for diagram number 589
-      FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 590 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 590
-      // (none)
-
-      // Amplitude(s) for diagram number 590
-      FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 591 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 591
-      // (none)
-
-      // Amplitude(s) for diagram number 591
-      FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 592 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 592
-      // (none)
-
-      // Amplitude(s) for diagram number 592
-      FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 593 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 593
-      // (none)
-
-      // Amplitude(s) for diagram number 593
-      VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 594 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 594
-      // (none)
-
-      // Amplitude(s) for diagram number 594
-      FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-
-      // *** DIAGRAM 595 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 595
-      // (none)
-
-      // Amplitude(s) for diagram number 595
-      FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 596 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 596
-      // (none)
-
-      // Amplitude(s) for diagram number 596
-      FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 597 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 597
-      // (none)
-
-      // Amplitude(s) for diagram number 597
-      FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-
-      // *** DIAGRAM 598 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 598
-      // (none)
-
-      // Amplitude(s) for diagram number 598
-      VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 599 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 599
-      // (none)
-
-      // Amplitude(s) for diagram number 599
-      FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 600 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 600
-      // (none)
-
-      // Amplitude(s) for diagram number 600
-      VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 601 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 601
-      // (none)
-
-      // Amplitude(s) for diagram number 601
-      FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-
-      // *** DIAGRAM 602 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 602
-      // (none)
-
-      // Amplitude(s) for diagram number 602
-      FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 603 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 603
-      // (none)
-
-      // Amplitude(s) for diagram number 603
-      FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 604 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 604
-      // (none)
-
-      // Amplitude(s) for diagram number 604
-      FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-
-      // *** DIAGRAM 605 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 605
-      // (none)
-
-      // Amplitude(s) for diagram number 605
-      VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 606 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 606
-      // (none)
-
-      // Amplitude(s) for diagram number 606
-      FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 607 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 607
-      // (none)
-
-      // Amplitude(s) for diagram number 607
-      FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 608 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 608
-      // (none)
-
-      // Amplitude(s) for diagram number 608
-      FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 609 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 609
-      // (none)
-
-      // Amplitude(s) for diagram number 609
-      FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 610 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 610
-      // (none)
-
-      // Amplitude(s) for diagram number 610
-      FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-
-      // *** DIAGRAM 611 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 611
-      // (none)
-
-      // Amplitude(s) for diagram number 611
-      FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 612 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 612
-      // (none)
-
-      // Amplitude(s) for diagram number 612
-      FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 613 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 613
-      // (none)
-
-      // Amplitude(s) for diagram number 613
-      FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 614 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 614
-      // (none)
-
-      // Amplitude(s) for diagram number 614
-      FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 615 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 615
-      // (none)
-
-      // Amplitude(s) for diagram number 615
-      VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 616 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 616
-      VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 616
-      FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 617 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 617
-      // (none)
-
-      // Amplitude(s) for diagram number 617
-      FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 618 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 618
-      VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] );
-
-      // Amplitude(s) for diagram number 618
-      VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 619 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 619
-      // (none)
-
-      // Amplitude(s) for diagram number 619
-      FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-
-      // *** DIAGRAM 620 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 620
-      VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 620
-      VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 621 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 621
-      // (none)
-
-      // Amplitude(s) for diagram number 621
-      FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-
-      // *** DIAGRAM 622 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 622
-      VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
-      VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
-      VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
-
-      // Amplitude(s) for diagram number 622
-      FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 623 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 623
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-      // Amplitude(s) for diagram number 623
-      FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 624 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 624
-      FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
-
-      // Amplitude(s) for diagram number 624
-      FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 625 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 625
-      // (none)
-
-      // Amplitude(s) for diagram number 625
-      FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-
-      // *** DIAGRAM 626 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 626
-      // (none)
-
-      // Amplitude(s) for diagram number 626
-      FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 627 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 627
-      FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-      // Amplitude(s) for diagram number 627
-      FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 628 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 628
-      // (none)
-
-      // Amplitude(s) for diagram number 628
-      FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-
-      // *** DIAGRAM 629 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 629
-      // (none)
-
-      // Amplitude(s) for diagram number 629
-      FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-
-      // *** DIAGRAM 630 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 630
-      // (none)
-
-      // Amplitude(s) for diagram number 630
-      FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-
-      // *** DIAGRAM 631 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 631
-      VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
-      // Amplitude(s) for diagram number 631
-      FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 632 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 632
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] );
-
-      // Amplitude(s) for diagram number 632
-      FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 633 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 633
-      // (none)
-
-      // Amplitude(s) for diagram number 633
-      FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 634 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 634
-      // (none)
-
-      // Amplitude(s) for diagram number 634
-      VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 635 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 635
-      // (none)
-
-      // Amplitude(s) for diagram number 635
-      FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-
-      // *** DIAGRAM 636 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 636
-      // (none)
-
-      // Amplitude(s) for diagram number 636
-      VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 637 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 637
-      // (none)
-
-      // Amplitude(s) for diagram number 637
-      FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 638 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 638
-      // (none)
-
-      // Amplitude(s) for diagram number 638
-      FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 639 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 639
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-      // Amplitude(s) for diagram number 639
-      FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 640 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 640
-      FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-      // Amplitude(s) for diagram number 640
-      FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 641 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 641
-      // (none)
-
-      // Amplitude(s) for diagram number 641
-      FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[53] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-
-      // *** DIAGRAM 642 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 642
-      // (none)
-
-      // Amplitude(s) for diagram number 642
-      FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 643 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 643
-      FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 643
-      FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 644 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 644
-      // (none)
-
-      // Amplitude(s) for diagram number 644
-      FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-
-      // *** DIAGRAM 645 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 645
-      // (none)
-
-      // Amplitude(s) for diagram number 645
-      FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-
-      // *** DIAGRAM 646 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 646
-      // (none)
-
-      // Amplitude(s) for diagram number 646
-      FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-
-      // *** DIAGRAM 647 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 647
-      // (none)
-
-      // Amplitude(s) for diagram number 647
-      FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 648 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 648
-      // (none)
-
-      // Amplitude(s) for diagram number 648
-      FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-
-      // *** DIAGRAM 649 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 649
-      // (none)
-
-      // Amplitude(s) for diagram number 649
-      FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 650 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 650
-      // (none)
-
-      // Amplitude(s) for diagram number 650
-      FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-
-      // *** DIAGRAM 651 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 651
-      // (none)
-
-      // Amplitude(s) for diagram number 651
-      FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 652 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 652
-      // (none)
-
-      // Amplitude(s) for diagram number 652
-      FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 653 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 653
-      // (none)
-
-      // Amplitude(s) for diagram number 653
-      FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 654 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 654
-      // (none)
-
-      // Amplitude(s) for diagram number 654
-      VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[55] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[90] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[55] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 655 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 655
-      VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 655
-      VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[55] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[90] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-
-      // *** DIAGRAM 656 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 656
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
-
-      // Amplitude(s) for diagram number 656
-      VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 657 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 657
-      // (none)
-
-      // Amplitude(s) for diagram number 657
-      VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[55] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 658 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 658
-      // (none)
-
-      // Amplitude(s) for diagram number 658
-      FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 659 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 659
-      // (none)
-
-      // Amplitude(s) for diagram number 659
-      FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-
-      // *** DIAGRAM 660 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 660
-      // (none)
-
-      // Amplitude(s) for diagram number 660
-      FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 661 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 661
-      // (none)
-
-      // Amplitude(s) for diagram number 661
-      FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 662 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 662
-      // (none)
-
-      // Amplitude(s) for diagram number 662
-      FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-
-      // *** DIAGRAM 663 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 663
-      // (none)
-
// Amplitude(s) for diagram number 663 - FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - - // *** DIAGRAM 664 OF 1240 *** - - // Wavefunction(s) for diagram number 664 - // (none) - - // Amplitude(s) for diagram number 664 - FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - - // *** DIAGRAM 665 OF 1240 *** - - // Wavefunction(s) for diagram number 665 - // (none) - - // Amplitude(s) for diagram number 665 - FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 666 OF 1240 *** - - // Wavefunction(s) for diagram number 666 - // (none) - - // Amplitude(s) for diagram number 666 - FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - - // *** DIAGRAM 667 OF 1240 *** - - // Wavefunction(s) for diagram number 667 - // (none) - - // Amplitude(s) for diagram number 667 - FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 668 OF 1240 *** - - // Wavefunction(s) for diagram number 668 - // (none) - - // Amplitude(s) for diagram number 668 - FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 669 OF 1240 *** - - // Wavefunction(s) for diagram number 669 - // (none) - - 
// Amplitude(s) for diagram number 669 - FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 670 OF 1240 *** - - // Wavefunction(s) for diagram number 670 - // (none) - - // Amplitude(s) for diagram number 670 - VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 671 OF 1240 *** - - // Wavefunction(s) for diagram number 671 - VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 671 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] 
+= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 672 OF 1240 *** - - // Wavefunction(s) for diagram number 672 - // (none) - - // Amplitude(s) for diagram number 672 - VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 673 OF 1240 *** - - // Wavefunction(s) for diagram number 673 - // (none) - - // Amplitude(s) for diagram number 673 - VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 674 OF 1240 *** - - // Wavefunction(s) for diagram number 674 - // (none) - - // Amplitude(s) for diagram number 674 - FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 675 OF 1240 *** - - // Wavefunction(s) for diagram number 675 - // (none) - - // Amplitude(s) for diagram number 675 - FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - - // *** DIAGRAM 676 OF 1240 *** - - // Wavefunction(s) for diagram number 676 - // (none) - - // Amplitude(s) for diagram number 676 - FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - - // *** DIAGRAM 677 OF 1240 *** - - // Wavefunction(s) for diagram number 677 - // (none) - - // Amplitude(s) for diagram number 677 - FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 678 OF 1240 *** - - // Wavefunction(s) for diagram number 678 - // (none) - - // Amplitude(s) for diagram number 678 - FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 679 OF 1240 *** - - // Wavefunction(s) for diagram number 679 - // (none) - - // Amplitude(s) for diagram number 679 - FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - - // *** DIAGRAM 680 OF 1240 *** - - // Wavefunction(s) for diagram number 680 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 680 - VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 681 OF 1240 *** - - // Wavefunction(s) for diagram number 681 - // (none) - - // Amplitude(s) for diagram number 681 - VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - - // *** DIAGRAM 682 OF 1240 *** - - // Wavefunction(s) for diagram number 682 - // (none) - - // Amplitude(s) for diagram number 682 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += 
amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 683 OF 1240 *** - - // Wavefunction(s) for diagram number 683 - // (none) - - // Amplitude(s) for diagram number 683 - VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 684 OF 1240 *** - - // Wavefunction(s) for diagram number 684 - // (none) - - // Amplitude(s) for diagram number 684 - VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 685 OF 1240 *** - - // Wavefunction(s) for diagram number 685 - // (none) - - // Amplitude(s) for diagram number 685 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], 
COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - - // *** DIAGRAM 686 OF 1240 *** - - // Wavefunction(s) for diagram number 686 - // (none) - - // Amplitude(s) for diagram number 686 - VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 687 OF 1240 *** - - // Wavefunction(s) for diagram number 687 - // (none) - - // Amplitude(s) for diagram number 687 - VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[90] 
+= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 688 OF 1240 *** - - // Wavefunction(s) for diagram number 688 - // (none) - - // Amplitude(s) for diagram number 688 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 689 OF 1240 *** - - // Wavefunction(s) for diagram number 689 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] ); - - // Amplitude(s) for diagram number 689 - VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += 
amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - - // *** DIAGRAM 690 OF 1240 *** - - // Wavefunction(s) for diagram number 690 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 690 - VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 691 OF 1240 *** - - // Wavefunction(s) for diagram number 691 - // (none) - - // Amplitude(s) for diagram number 691 - VVV1_0( w_fp[1], w_fp[8], w_fp[107], 
COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 692 OF 1240 *** - - // Wavefunction(s) for diagram number 692 - // (none) - - // Amplitude(s) for diagram number 692 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 693 OF 1240 *** - - // Wavefunction(s) for diagram number 693 - // (none) - - // Amplitude(s) for diagram number 693 - VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 694 OF 1240 *** - - // Wavefunction(s) for diagram number 694 - // (none) - - // Amplitude(s) for diagram number 694 - VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 695 OF 1240 *** - - // Wavefunction(s) for diagram number 695 - // (none) - - // Amplitude(s) for diagram number 695 - VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 696 OF 1240 *** - - // Wavefunction(s) for diagram number 696 - // (none) - - // Amplitude(s) for diagram number 696 - VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 697 OF 1240 *** - - // Wavefunction(s) for diagram number 697 - // (none) - - // Amplitude(s) for diagram number 697 - FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - - // *** DIAGRAM 698 OF 1240 *** - - // Wavefunction(s) for diagram number 698 - // (none) - - // Amplitude(s) for diagram number 698 - FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 699 OF 1240 *** - - // Wavefunction(s) for diagram number 699 - // (none) - - // Amplitude(s) for diagram number 699 - FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 700 OF 1240 *** - - // Wavefunction(s) for diagram number 700 - // (none) - - // Amplitude(s) for diagram number 700 - FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 701 OF 1240 *** - - // Wavefunction(s) for diagram number 701 - // (none) - - // Amplitude(s) for diagram number 701 - VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 702 OF 1240 *** - - // Wavefunction(s) for diagram number 702 - // (none) - - // Amplitude(s) for diagram number 702 - FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 703 OF 1240 *** - - // Wavefunction(s) for diagram number 703 - // (none) - - // Amplitude(s) for diagram number 703 - FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 704 OF 1240 *** - - // Wavefunction(s) for diagram number 704 - // (none) - - // Amplitude(s) for diagram number 704 - FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 705 OF 1240 *** - - // Wavefunction(s) for diagram number 705 - // (none) - - // Amplitude(s) for diagram number 705 - FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 706 OF 1240 *** - - // Wavefunction(s) for diagram number 706 - // (none) - - // Amplitude(s) for diagram number 706 - VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 707 OF 1240 *** - - // Wavefunction(s) for diagram number 707 - // (none) - - // Amplitude(s) for diagram number 707 - FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - 
jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - - // *** DIAGRAM 708 OF 1240 *** - - // Wavefunction(s) for diagram number 708 - // (none) - - // Amplitude(s) for diagram number 708 - FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 709 OF 1240 *** - - // Wavefunction(s) for diagram number 709 - // (none) - - // Amplitude(s) for diagram number 709 - FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 710 OF 1240 *** - - // Wavefunction(s) for diagram number 710 - // (none) - - // Amplitude(s) for diagram number 710 - FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - - // *** DIAGRAM 711 OF 1240 *** - - // Wavefunction(s) for diagram number 711 - // (none) - - // Amplitude(s) for diagram number 711 - VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 712 OF 1240 *** - - // Wavefunction(s) for diagram number 712 - // (none) - - // Amplitude(s) for diagram number 712 - FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 713 OF 1240 *** - - // Wavefunction(s) for diagram number 713 - // (none) - - // Amplitude(s) for diagram number 713 - FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - - // *** DIAGRAM 714 OF 1240 *** - - // Wavefunction(s) for diagram number 714 - // (none) - - // Amplitude(s) for diagram number 714 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 715 OF 1240 *** - - // Wavefunction(s) for diagram number 715 - // (none) - - // Amplitude(s) for diagram number 715 - FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 716 OF 1240 *** - - // Wavefunction(s) for diagram number 716 - // (none) - - // Amplitude(s) for diagram number 716 - VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 717 OF 1240 *** - - // Wavefunction(s) for diagram number 717 - // (none) - - // Amplitude(s) for diagram number 717 - FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 718 OF 1240 *** - - // Wavefunction(s) for diagram number 718 - // (none) - - // Amplitude(s) for diagram number 718 - FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 719 OF 1240 *** - - // Wavefunction(s) for diagram number 719 - // (none) - - // 
Amplitude(s) for diagram number 719 - FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 720 OF 1240 *** - - // Wavefunction(s) for diagram number 720 - // (none) - - // Amplitude(s) for diagram number 720 - FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 721 OF 1240 *** - - // Wavefunction(s) for diagram number 721 - // (none) - - // Amplitude(s) for diagram number 721 - VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 722 OF 1240 *** - - // Wavefunction(s) for diagram number 722 - // (none) - - // Amplitude(s) for diagram number 722 - FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 723 OF 1240 *** - - // Wavefunction(s) for diagram number 723 - // (none) - - // Amplitude(s) for diagram number 723 - 
VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 724 OF 1240 *** - - // Wavefunction(s) for diagram number 724 - // (none) - - // Amplitude(s) for diagram number 724 - FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - - // *** DIAGRAM 725 OF 1240 *** - - // Wavefunction(s) for diagram number 725 - // (none) - - // Amplitude(s) for diagram number 725 - FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 726 OF 1240 *** - - // Wavefunction(s) for diagram number 726 - // (none) - - // Amplitude(s) for diagram number 726 - FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 727 OF 1240 *** - - // Wavefunction(s) for diagram number 727 - // (none) - - // Amplitude(s) for diagram number 727 - FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 728 OF 1240 *** - - // Wavefunction(s) for diagram number 728 - // (none) - - // Amplitude(s) for diagram number 728 - VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 729 OF 1240 *** - - // Wavefunction(s) for diagram number 729 - // (none) - - // Amplitude(s) for diagram number 729 - FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 730 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 730
-    // (none)
-
-    // Amplitude(s) for diagram number 730
-    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 731 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 731
-    // (none)
-
-    // Amplitude(s) for diagram number 731
-    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 732 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 732
-    // (none)
-
-    // Amplitude(s) for diagram number 732
-    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 733 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 733
-    // (none)
-
-    // Amplitude(s) for diagram number 733
-    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-
-    // *** DIAGRAM 734 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 734
-    // (none)
-
-    // Amplitude(s) for diagram number 734
-    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 735 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 735
-    // (none)
-
-    // Amplitude(s) for diagram number 735
-    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-
-    // *** DIAGRAM 736 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 736
-    // (none)
-
-    // Amplitude(s) for diagram number 736
-    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 737 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 737
-    // (none)
-
-    // Amplitude(s) for diagram number 737
-    FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 738 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 738
-    // (none)
-
-    // Amplitude(s) for diagram number 738
-    VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-    jamp_sv[32] += amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[101] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[33] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 739 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 739
-    FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] );
-
-    // Amplitude(s) for diagram number 739
-    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 740 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 740
-    // (none)
-
-    // Amplitude(s) for diagram number 740
-    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] -= amp_sv[0];
-
-    // *** DIAGRAM 741 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 741
-    FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-    // Amplitude(s) for diagram number 741
-    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] -= amp_sv[0];
-
-    // *** DIAGRAM 742 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 742
-    // (none)
-
-    // Amplitude(s) for diagram number 742
-    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[46] -= amp_sv[0];
-
-    // *** DIAGRAM 743 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 743
-    // (none)
-
-    // Amplitude(s) for diagram number 743
-    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[37] -= amp_sv[0];
-
-    // *** DIAGRAM 744 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 744
-    // (none)
-
-    // Amplitude(s) for diagram number 744
-    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[43] -= amp_sv[0];
-
-    // *** DIAGRAM 745 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 745
-    // (none)
-
-    // Amplitude(s) for diagram number 745
-    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 746 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 746
-    // (none)
-
-    // Amplitude(s) for diagram number 746
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 747 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 747
-    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
-
-    // Amplitude(s) for diagram number 747
-    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-
-    // *** DIAGRAM 748 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 748
-    // (none)
-
-    // Amplitude(s) for diagram number 748
-    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 749 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 749
-    // (none)
-
-    // Amplitude(s) for diagram number 749
-    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= amp_sv[0];
-
-    // *** DIAGRAM 750 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 750
-    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-    // Amplitude(s) for diagram number 750
-    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] -= amp_sv[0];
-
-    // *** DIAGRAM 751 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 751
-    // (none)
-
-    // Amplitude(s) for diagram number 751
-    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[44] -= amp_sv[0];
-
-    // *** DIAGRAM 752 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 752
-    // (none)
-
-    // Amplitude(s) for diagram number 752
-    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[31] -= amp_sv[0];
-
-    // *** DIAGRAM 753 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 753
-    // (none)
-
-    // Amplitude(s) for diagram number 753
-    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] -= amp_sv[0];
-
-    // *** DIAGRAM 754 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 754
-    // (none)
-
-    // Amplitude(s) for diagram number 754
-    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 755 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 755
-    // (none)
-
-    // Amplitude(s) for diagram number 755
-    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 756 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 756
-    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-    // Amplitude(s) for diagram number 756
-    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[34] -= amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-
-    // *** DIAGRAM 757 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 757
-    // (none)
-
-    // Amplitude(s) for diagram number 757
-    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] -= amp_sv[0];
-
-    // *** DIAGRAM 758 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 758
-    // (none)
-
-    // Amplitude(s) for diagram number 758
-    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= amp_sv[0];
-
-    // *** DIAGRAM 759 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 759
-    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-    // Amplitude(s) for diagram number 759
-    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= amp_sv[0];
-
-    // *** DIAGRAM 760 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 760
-    // (none)
-
-    // Amplitude(s) for diagram number 760
-    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[38] -= amp_sv[0];
-
-    // *** DIAGRAM 761 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 761
-    // (none)
-
-    // Amplitude(s) for diagram number 761
-    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] -= amp_sv[0];
-
-    // *** DIAGRAM 762 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 762
-    // (none)
-
-    // Amplitude(s) for diagram number 762
-    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] -= amp_sv[0];
-
-    // *** DIAGRAM 763 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 763
-    // (none)
-
-    // Amplitude(s) for diagram number 763
-    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 764 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 764
-    // (none)
-
-    // Amplitude(s) for diagram number 764
-    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 765 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 765
-    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
-
-    // Amplitude(s) for diagram number 765
-    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-
-    // *** DIAGRAM 766 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 766
-    // (none)
-
-    // Amplitude(s) for diagram number 766
-    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 767 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 767
-    // (none)
-
-    // Amplitude(s) for diagram number 767
-    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 768 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 768
-    // (none)
-
-    // Amplitude(s) for diagram number 768
-    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 769 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 769
-    // (none)
-
-    // Amplitude(s) for diagram number 769
-    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 770 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 770
-    // (none)
-
-    // Amplitude(s) for diagram number 770
-    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 771 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 771
-    // (none)
-
-    // Amplitude(s) for diagram number 771
-    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 772 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 772
-    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
-    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
-    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 772
-    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 773 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 773
-    // (none)
-
-    // Amplitude(s) for diagram number 773
-    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 774 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 774
-    // (none)
-
-    // Amplitude(s) for diagram number 774
-    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 775 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 775
-    // (none)
-
-    // Amplitude(s) for diagram number 775
-    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 776 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 776
-    // (none)
-
-    // Amplitude(s) for diagram number 776
-    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-
-    // *** DIAGRAM 777 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 777
-    // (none)
-
-    // Amplitude(s) for diagram number 777
-    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 778 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 778
-    // (none)
-
-    // Amplitude(s) for diagram number 778
-    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 779 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 779
-    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] );
-    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-    // Amplitude(s) for diagram number 779
-    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 780 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 780
-    // (none)
-
-    // Amplitude(s) for diagram number 780
-    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 781 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 781
-    // (none)
-
-    // Amplitude(s) for diagram number 781
-    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 782 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 782
-    // (none)
-
-    // Amplitude(s) for diagram number 782
-    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 783 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 783
-    // (none)
-
-    // Amplitude(s) for diagram number 783
-    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[35] += amp_sv[0];
-
-    // *** DIAGRAM 784 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 784
-    // (none)
-
-    // Amplitude(s) for diagram number 784
-    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 785 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 785
-    // (none)
-
-    // Amplitude(s) for diagram number 785
-    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 786 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 786
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
-
-    // Amplitude(s) for diagram number 786
-    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 787 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 787
-    // (none)
-
-    // Amplitude(s) for diagram number 787
-    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 788 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 788
-    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
-    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
-    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
-
-    // Amplitude(s) for diagram number 788
-    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 789 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 789
-    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 789
-    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] -= amp_sv[0];
-
-    // *** DIAGRAM 790 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 790
-    // (none)
-
-    // Amplitude(s) for diagram number 790
-    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] -= amp_sv[0];
-
-    // *** DIAGRAM 791 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 791
-    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-    // Amplitude(s) for diagram number 791
-    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[53] -= amp_sv[0];
-
-    // *** DIAGRAM 792 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 792
-    // (none)
-
-    // Amplitude(s) for diagram number 792
-    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] -= amp_sv[0];
-
-    // *** DIAGRAM 793 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 793
-    // (none)
-
-    // Amplitude(s) for diagram number 793
-    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[67] -= amp_sv[0];
-
-    // *** DIAGRAM 794 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 794
-    // (none)
-
-    // Amplitude(s) for diagram number 794
-    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] -= amp_sv[0];
-
-    // *** DIAGRAM 795 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 795
-    // (none)
-
-    // Amplitude(s) for diagram number 795
-    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 796 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 796
-    // (none)
-
-    // Amplitude(s) for diagram number 796
-    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 797 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 797
-    // (none)
-
-    // Amplitude(s) for diagram number 797
-    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-
-    // *** DIAGRAM 798 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 798
-    // (none)
-
-    // Amplitude(s) for diagram number 798
-    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] -= amp_sv[0];
-
-    // *** DIAGRAM 799 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 799
-    // (none)
-
-    // Amplitude(s) for diagram number 799
-    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[94] -= amp_sv[0];
-
-    // *** DIAGRAM 800 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 800
-    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-    // Amplitude(s) for diagram number 800
-    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[77] -= amp_sv[0];
-
-    // *** DIAGRAM 801 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 801
-    // (none)
-
-    // Amplitude(s) for diagram number 801
-    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] -= amp_sv[0];
-
-    // *** DIAGRAM 802 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 802
-    // (none)
-
-    // Amplitude(s) for diagram number 802
-    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] -= amp_sv[0];
-
-    // *** DIAGRAM 803 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 803
-    // (none)
-
-    // Amplitude(s) for diagram number 803
-    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[85] -= amp_sv[0];
-
-    // *** DIAGRAM 804 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 804
-    // (none)
-
-    // Amplitude(s) for diagram number 804
-    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 805 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 805
-    // (none)
-
-    // Amplitude(s) for diagram number 805
-    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 806 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 806
-    // (none)
-
-    // Amplitude(s) for diagram number 806
-    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[77] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-
-    // *** DIAGRAM 807 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 807
-    // (none)
-
-    // Amplitude(s) for diagram number 807
-    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 808 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 808
-    // (none)
-
-    // Amplitude(s) for diagram number 808
-    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 809 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 809
-    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-    // Amplitude(s) for diagram number 809
-    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 810 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 810
-    // (none)
-
-    // Amplitude(s) for diagram number 810
-    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] -= amp_sv[0];
-
-    // *** DIAGRAM 811 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 811
-    // (none)
-
-    // Amplitude(s) for diagram number 811
-    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[115] -= amp_sv[0];
-
-    // *** DIAGRAM 812 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 812
-    // (none)
-
-    // Amplitude(s) for diagram number 812
-    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[109] -= amp_sv[0];
-
-    // *** DIAGRAM 813 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 813
-    // (none)
-
-    // Amplitude(s) for diagram number 813
-    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 814 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 814
-    // (none)
-
-    // Amplitude(s) for diagram number 814
-    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 815 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 815
-    // (none)
-
-    // Amplitude(s) for diagram number 815
-    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 816 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 816
-    // (none)
-
-    // Amplitude(s) for diagram number 816
-    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 817 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 817
-    // (none)
-
-    // Amplitude(s) for diagram number 817
-    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 818 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 818
-    // (none)
-
-    // Amplitude(s) for diagram number 818
-    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 819 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 819
-    // (none)
-
-    // Amplitude(s) for diagram number 819
-    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-
-    // *** DIAGRAM 820 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 820
-    // (none)
-
-    // Amplitude(s) for diagram number 820
-    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 821 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 821
-    // (none)
-
-    // Amplitude(s) for diagram number 821
-    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 822 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 822
-    // (none)
-
-    // Amplitude(s) for diagram number 822
-    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 823 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 823
-    // (none)
-
-    // Amplitude(s) for diagram number 823
-    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 824 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 824
-    // (none)
-
-    // Amplitude(s) for diagram number 824
-    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 825 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 825
-    // (none)
-
-    // Amplitude(s) for diagram number 825
-    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 826 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 826
-    // (none)
-
-    // Amplitude(s) for diagram number 826
-    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 827 OF 1240 ***
DIAGRAM 827 OF 1240 *** - - // Wavefunction(s) for diagram number 827 - // (none) - - // Amplitude(s) for diagram number 827 - VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 828 OF 1240 *** - - // Wavefunction(s) for diagram number 828 - // (none) - - // Amplitude(s) for diagram number 828 - FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 829 OF 1240 *** - - // Wavefunction(s) for diagram number 829 - // (none) - - // Amplitude(s) for diagram number 829 - FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 830 OF 1240 *** - - // Wavefunction(s) for diagram number 830 - // (none) - - // Amplitude(s) for diagram number 830 - FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 831 OF 1240 *** - - // Wavefunction(s) for diagram 
number 831 - // (none) - - // Amplitude(s) for diagram number 831 - FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 832 OF 1240 *** - - // Wavefunction(s) for diagram number 832 - // (none) - - // Amplitude(s) for diagram number 832 - VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 833 OF 1240 *** - - // Wavefunction(s) for diagram number 833 - // (none) - - // Amplitude(s) for diagram number 833 - FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 834 OF 1240 *** - - // Wavefunction(s) for diagram number 834 - // (none) - - // Amplitude(s) for diagram number 834 - VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 835 OF 1240 *** - - // Wavefunction(s) for diagram number 835 - // (none) - - // Amplitude(s) for diagram number 835 - FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 836 OF 1240 *** - - // Wavefunction(s) for diagram number 836 - // (none) - - // Amplitude(s) for diagram number 836 - FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 837 OF 1240 *** - - // Wavefunction(s) for diagram number 837 - // (none) - - // Amplitude(s) for diagram number 837 - FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 838 OF 1240 *** - - // Wavefunction(s) for diagram number 838 - // (none) - - // Amplitude(s) for diagram number 838 - FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], 
w_fp[106], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 839 OF 1240 *** - - // Wavefunction(s) for diagram number 839 - VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] ); - - // Amplitude(s) for diagram number 839 - VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 840 OF 1240 *** - - // Wavefunction(s) for diagram number 840 - // (none) - - // Amplitude(s) for diagram number 840 - VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 841 OF 1240 *** - - // Wavefunction(s) for diagram number 841 - // (none) - - // Amplitude(s) for diagram number 841 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] 
-= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 842 OF 1240 *** - - // Wavefunction(s) for diagram number 842 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); - - // Amplitude(s) for diagram number 842 - VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 843 OF 1240 *** - - // Wavefunction(s) for diagram number 843 - // (none) - - // Amplitude(s) for diagram number 843 - VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 844 OF 1240 *** - - // Wavefunction(s) for diagram number 844 - // (none) - - // Amplitude(s) for diagram number 844 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 845 OF 1240 *** - - // Wavefunction(s) for diagram number 845 - // (none) - - // Amplitude(s) for diagram number 845 - VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 846 OF 1240 *** - - // Wavefunction(s) for diagram number 846 - // (none) - - // Amplitude(s) for diagram number 846 - VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 847 OF 1240 *** - - // Wavefunction(s) for diagram number 847 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 847 - VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= 
amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 848 OF 1240 *** - - // Wavefunction(s) for diagram number 848 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 848 - VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - 
jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 849 OF 1240 *** - - // Wavefunction(s) for diagram number 849 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); - - // Amplitude(s) for diagram number 849 - VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 850 OF 1240 *** - - // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); - - // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - 
jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 851 OF 1240 *** - - // Wavefunction(s) for diagram number 851 - // (none) - - // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - 
jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 852 OF 1240 *** - - // Wavefunction(s) for diagram number 852 - // (none) - - // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 853 OF 1240 *** - - // Wavefunction(s) for diagram number 853 - // (none) - - // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 854 OF 1240 *** - - // Wavefunction(s) for diagram number 854 - // (none) - - // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 855 OF 1240 *** - - // Wavefunction(s) for diagram number 855 - // (none) - - // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 856 OF 1240 *** - - // Wavefunction(s) for diagram number 856 - // (none) - - // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 857 OF 1240 *** - - // Wavefunction(s) for diagram number 857 - // (none) - - // Amplitude(s) for diagram number 857 - FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 858 OF 1240 *** - - // Wavefunction(s) for diagram number 858 - // (none) - - // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 859 OF 1240 *** - - // Wavefunction(s) for diagram number 859 - // (none) - - // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 860 OF 1240 *** - - // Wavefunction(s) for diagram number 860 - // (none) - - // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 861 OF 1240 *** - - // Wavefunction(s) for diagram number 861 - // (none) - - // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] 
+= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 862 OF 1240 *** - - // Wavefunction(s) for diagram number 862 - // (none) - - // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 863 OF 1240 *** - - // Wavefunction(s) for diagram number 863 - // (none) - - // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 864 OF 1240 *** - - // Wavefunction(s) for diagram number 864 - // (none) - - // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 865 OF 1240 *** - - // Wavefunction(s) for diagram number 865 - // (none) - - // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 866 OF 1240 *** - - // Wavefunction(s) for diagram number 866 - // (none) - - // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 867 OF 1240 *** - - // Wavefunction(s) for diagram number 867 - // (none) - - // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 868 OF 1240 *** - - // Wavefunction(s) for diagram number 868 - // (none) - - // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 869 OF 1240 *** - - // Wavefunction(s) for diagram number 869 - // (none) - - // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 870 OF 1240 *** - - // Wavefunction(s) for diagram number 870 - // (none) - - // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 871 OF 1240 *** - - // Wavefunction(s) for diagram number 871 - // (none) - - // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 872 OF 1240 *** - - // Wavefunction(s) for diagram number 872 - // (none) - - // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 873 OF 1240 *** - - // Wavefunction(s) for diagram number 873 - // (none) - - // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 874 OF 1240 *** - - // Wavefunction(s) for diagram number 874 - // (none) - - // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 875 OF 1240 *** - - // Wavefunction(s) for diagram number 875 - // (none) - - // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 876 OF 1240 *** - - // Wavefunction(s) for diagram number 876 - // (none) - - // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 877 OF 1240 *** - - // Wavefunction(s) for diagram number 877 - // (none) - - // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 878 OF 1240 *** - - // Wavefunction(s) for diagram number 878 - // (none) - - // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 879 OF 
1240 *** - - // Wavefunction(s) for diagram number 879 - // (none) - - // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 880 OF 1240 *** - - // Wavefunction(s) for diagram number 880 - // (none) - - // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 881 OF 1240 *** - - // Wavefunction(s) for diagram number 881 - // (none) - - // Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 882 OF 1240 *** - - // Wavefunction(s) for diagram number 882 - // (none) - - // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 883 OF 1240 *** - - // Wavefunction(s) for diagram number 883 - // (none) - - // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - - // *** DIAGRAM 884 OF 1240 *** - - // Wavefunction(s) for diagram number 884 - // (none) - - // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 885 OF 1240 *** - - // Wavefunction(s) for diagram number 885 - // (none) - - // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 886 OF 1240 *** - - // Wavefunction(s) for diagram number 886 - // (none) - - // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 887 OF 1240 *** - - // Wavefunction(s) for diagram number 887 - // (none) - - // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 888 OF 1240 *** - - // Wavefunction(s) for diagram number 888 - // (none) - - // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 889 OF 1240 *** - - // Wavefunction(s) for diagram number 889 - // (none) - - // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 890 OF 1240 *** - - // Wavefunction(s) for diagram number 890 - // (none) - - // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 891 OF 1240 *** - - // Wavefunction(s) for diagram number 891 - // (none) - - // Amplitude(s) for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 892 OF 1240 *** - - // Wavefunction(s) for diagram number 892 - // (none) - - // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] 
-= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 893 OF 1240 *** - - // Wavefunction(s) for diagram number 893 - // (none) - - // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 894 OF 1240 *** - - // Wavefunction(s) for diagram number 894 - // (none) - - // Amplitude(s) for diagram number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 895 OF 1240 *** - - // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); - - // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 896 OF 1240 *** - - // Wavefunction(s) for diagram number 896 - // (none) - - // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 897 OF 1240 *** - - // Wavefunction(s) for diagram number 897 - // (none) - - // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - 
jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 898 OF 1240 *** - - // Wavefunction(s) for diagram number 898 - // (none) - - // Amplitude(s) for diagram number 898 - VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 899 OF 1240 *** - - // Wavefunction(s) for diagram number 899 - // (none) - - // Amplitude(s) for diagram number 899 - VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 900 OF 1240 *** - - // Wavefunction(s) for diagram number 900 - // (none) - - // Amplitude(s) for diagram number 900 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
-#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 901 OF 1240 *** - - // Wavefunction(s) for diagram number 901 - // (none) - - // Amplitude(s) for diagram number 901 - VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 902 OF 1240 *** - - // Wavefunction(s) for diagram number 902 - // (none) - - // Amplitude(s) for diagram number 902 - VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 903 OF 1240 *** - - // Wavefunction(s) for diagram number 903 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 
1.0, 0., 0., w_fp[93] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 903 - VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 904 OF 1240 *** - - // Wavefunction(s) for diagram number 904 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] ); - - // Amplitude(s) for diagram number 904 - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[37] += 
amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 905 OF 1240 *** - - // Wavefunction(s) for diagram number 905 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); - - // Amplitude(s) for diagram number 905 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 906 OF 1240 *** - - // Wavefunction(s) for diagram number 906 - // (none) - - // Amplitude(s) for diagram number 906 - VVV1_0( 
w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 907 OF 1240 *** - - // Wavefunction(s) for diagram number 907 - // (none) - - // Amplitude(s) for diagram number 907 - VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 908 OF 1240 *** - - // Wavefunction(s) for diagram number 908 - // (none) - - // Amplitude(s) for diagram number 908 - VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 909 OF 1240 *** - - // Wavefunction(s) for diagram number 909 - // (none) - - // Amplitude(s) for diagram number 909 - VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 910 OF 1240 *** - - // Wavefunction(s) for diagram number 910 - // (none) - - // Amplitude(s) for diagram number 910 - VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 911 OF 1240 *** - - // Wavefunction(s) for diagram number 911 - // (none) - - // Amplitude(s) for diagram number 911 - VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 912 OF 1240 *** - - // Wavefunction(s) for diagram number 912 - // (none) - - // Amplitude(s) for diagram number 912 - FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 913 OF 1240 *** - - // Wavefunction(s) for diagram number 913 - // (none) - - // Amplitude(s) for diagram number 913 - FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 914 OF 1240 *** - - // Wavefunction(s) for diagram number 914 - // (none) - - // Amplitude(s) for diagram number 914 - FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 915 OF 1240 *** - - // Wavefunction(s) for diagram number 915 - // (none) - - // Amplitude(s) for diagram number 915 - FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 916 OF 1240 *** - - // Wavefunction(s) for diagram number 916 - // (none) - - // Amplitude(s) for diagram number 916 - VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 917 OF 1240 *** - - // Wavefunction(s) for diagram number 917 - // (none) - - // Amplitude(s) for diagram number 917 - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 918 OF 1240 *** - - // Wavefunction(s) for diagram number 918 - // (none) - - // Amplitude(s) for diagram number 918 - FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - - // *** DIAGRAM 919 OF 1240 *** - - // Wavefunction(s) for diagram number 919 - // (none) - - // Amplitude(s) for diagram number 919 - FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 920 OF 1240 *** - - // Wavefunction(s) for diagram number 920 - // (none) - - // Amplitude(s) for diagram number 920 - FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 921 OF 1240 *** - - // Wavefunction(s) for diagram number 921 - // (none) - - // Amplitude(s) for diagram number 921 - VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 922 OF 1240 *** - - // Wavefunction(s) for diagram number 922 - // (none) - - // Amplitude(s) for diagram number 922 - FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += 
amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 923 OF 1240 *** - - // Wavefunction(s) for diagram number 923 - // (none) - - // Amplitude(s) for diagram number 923 - FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 924 OF 1240 *** - - // Wavefunction(s) for diagram number 924 - // (none) - - // Amplitude(s) for diagram number 924 - FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 925 OF 1240 *** - - // Wavefunction(s) for diagram number 925 - // (none) - - // Amplitude(s) for diagram number 925 - FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 926 OF 1240 *** - - // Wavefunction(s) for diagram number 926 - // (none) - - // Amplitude(s) for diagram number 926 - VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 927 OF 1240 *** - - // Wavefunction(s) for diagram number 927 - // (none) - - // Amplitude(s) for diagram number 927 - FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] 
); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 928 OF 1240 *** - - // Wavefunction(s) for diagram number 928 - // (none) - - // Amplitude(s) for diagram number 928 - FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 929 OF 1240 *** - - // Wavefunction(s) for diagram number 929 - // (none) - - // Amplitude(s) for diagram number 929 - FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 930 OF 1240 *** - - // Wavefunction(s) for diagram number 930 - // (none) - - // Amplitude(s) for diagram number 930 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 931 OF 1240 *** - - // Wavefunction(s) for diagram number 931 - // (none) - - // Amplitude(s) for diagram number 931 - VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 932 OF 1240 *** - - // Wavefunction(s) for diagram number 932 - // (none) - - // Amplitude(s) for diagram number 932 - FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - - // *** DIAGRAM 933 OF 1240 *** - - // Wavefunction(s) for diagram number 933 - // (none) - - // Amplitude(s) for diagram number 933 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 934 OF 
1240 *** - - // Wavefunction(s) for diagram number 934 - // (none) - - // Amplitude(s) for diagram number 934 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 935 OF 1240 *** - - // Wavefunction(s) for diagram number 935 - // (none) - - // Amplitude(s) for diagram number 935 - FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 936 OF 1240 *** - - // Wavefunction(s) for diagram number 936 - // (none) - - // Amplitude(s) for diagram number 936 - VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 937 OF 1240 *** - - // Wavefunction(s) for diagram number 937 - // (none) - - // Amplitude(s) for diagram number 937 - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 938 OF 1240 *** - - // Wavefunction(s) for 
diagram number 938 - // (none) - - // Amplitude(s) for diagram number 938 - VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 939 OF 1240 *** - - // Wavefunction(s) for diagram number 939 - // (none) - - // Amplitude(s) for diagram number 939 - FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - - // *** DIAGRAM 940 OF 1240 *** - - // Wavefunction(s) for diagram number 940 - // (none) - - // Amplitude(s) for diagram number 940 - FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 941 OF 1240 *** - - // Wavefunction(s) for diagram number 941 - // (none) - - // Amplitude(s) for diagram number 941 - FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - - // *** DIAGRAM 942 OF 1240 *** - - // Wavefunction(s) for diagram number 942 - // (none) - - // Amplitude(s) for diagram number 942 - FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 943 OF 1240 *** - - // Wavefunction(s) for diagram number 943 - // (none) - - // Amplitude(s) for diagram number 943 - VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 944 OF 1240 *** - - // Wavefunction(s) for diagram number 944 - // (none) - - // Amplitude(s) for diagram number 944 - FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 945 OF 1240 *** - - // Wavefunction(s) for diagram number 945 - // (none) - - // Amplitude(s) for diagram number 945 - FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 946 OF 1240 *** - - // Wavefunction(s) for diagram number 946 - // (none) - - // Amplitude(s) for diagram number 946 - FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 947 OF 1240 *** - - // Wavefunction(s) for diagram number 947 - // (none) - - // Amplitude(s) for diagram number 947 - FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 948 OF 1240 *** - - // Wavefunction(s) for diagram number 948 - // (none) - - // Amplitude(s) for diagram number 948 - FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 949 OF 1240 *** - - // Wavefunction(s) for diagram number 949 - // (none) - - // Amplitude(s) for diagram number 949 - FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - - // *** DIAGRAM 950 OF 1240 *** - - // Wavefunction(s) for diagram number 950 - // (none) - - // Amplitude(s) for diagram number 950 - FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 951 OF 1240 *** - - // Wavefunction(s) for diagram number 951 - VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 951 - VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 952 OF 1240 *** - - // Wavefunction(s) for diagram number 952 - // (none) - - // Amplitude(s) for diagram number 952 - VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 953 OF 1240 *** - - // Wavefunction(s) for diagram number 953 - // (none) - - // Amplitude(s) for diagram 
number 953
-      VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 954 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 954
-      // (none)
-
-      // Amplitude(s) for diagram number 954
-      VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 955 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 955
-      // (none)
-
-      // Amplitude(s) for diagram number 955
-      VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 956 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 956
-      // (none)
-
-      // Amplitude(s) for diagram number 956
-      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 957 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 957
-      // (none)
-
-      // Amplitude(s) for diagram number 957
-      VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-
-      // *** DIAGRAM 958 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 958
-      // (none)
-
-      // Amplitude(s) for diagram number 958
-      VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 959 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 959
-      VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
-      VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
-      VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-      // Amplitude(s) for diagram number 959
-      VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-
-      // *** DIAGRAM 960 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 960
-      VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
-      VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
-
-      // Amplitude(s) for diagram number 960
-      VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-
-      // *** DIAGRAM 961 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 961
-      // (none)
-
-      // Amplitude(s) for diagram number 961
-      VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 962 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 962
-      // (none)
-
-      // Amplitude(s) for diagram number 962
-      VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 963 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 963
-      // (none)
-
-      // Amplitude(s) for diagram number 963
-      VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 964 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 964
-      // (none)
-
-      // Amplitude(s) for diagram number 964
-      VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-
-      // *** DIAGRAM 965 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 965
-      // (none)
-
-      // Amplitude(s) for diagram number 965
-      VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 966 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 966
-      // (none)
-
-      // Amplitude(s) for diagram number 966
-      VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 967 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 967
-      // (none)
-
-      // Amplitude(s) for diagram number 967
-      VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 968 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 968
-      // (none)
-
-      // Amplitude(s) for diagram number 968
-      FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-
-      // *** DIAGRAM 969 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 969
-      // (none)
-
-      // Amplitude(s) for diagram number 969
-      FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 970 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 970
-      // (none)
-
-      // Amplitude(s) for diagram number 970
-      FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-
-      // *** DIAGRAM 971 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 971
-      // (none)
-
-      // Amplitude(s) for diagram number 971
-      FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 972 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 972
-      // (none)
-
-      // Amplitude(s) for diagram number 972
-      VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 973 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 973
-      // (none)
-
-      // Amplitude(s) for diagram number 973
-      FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 974 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 974
-      // (none)
-
-      // Amplitude(s) for diagram number 974
-      FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-
-      // *** DIAGRAM 975 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 975
-      // (none)
-
-      // Amplitude(s) for diagram number 975
-      FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 976 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 976
-      // (none)
-
-      // Amplitude(s) for diagram number 976
-      FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 977 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 977
-      // (none)
-
-      // Amplitude(s) for diagram number 977
-      VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 978 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 978
-      // (none)
-
-      // Amplitude(s) for diagram number 978
-      FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-
-      // *** DIAGRAM 979 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 979
-      // (none)
-
-      // Amplitude(s) for diagram number 979
-      FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 980 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 980
-      // (none)
-
-      // Amplitude(s) for diagram number 980
-      FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-
-      // *** DIAGRAM 981 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 981
-      // (none)
-
-      // Amplitude(s) for diagram number 981
-      FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 982 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 982
-      // (none)
-
-      // Amplitude(s) for diagram number 982
-      VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 983 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 983
-      // (none)
-
-      // Amplitude(s) for diagram number 983
-      FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 984 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 984
-      // (none)
-
-      // Amplitude(s) for diagram number 984
-      FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-
-      // *** DIAGRAM 985 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 985
-      // (none)
-
-      // Amplitude(s) for diagram number 985
-      FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 986 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 986
-      // (none)
-
-      // Amplitude(s) for diagram number 986
-      FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 987 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 987
-      // (none)
-
-      // Amplitude(s) for diagram number 987
-      VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 988 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 988
-      // (none)
-
-      // Amplitude(s) for diagram number 988
-      FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 989 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 989
-      // (none)
-
-      // Amplitude(s) for diagram number 989
-      FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 990 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 990
-      // (none)
-
-      // Amplitude(s) for diagram number 990
-      FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 991 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 991
-      // (none)
-
-      // Amplitude(s) for diagram number 991
-      FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 992 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 992
-      // (none)
-
-      // Amplitude(s) for diagram number 992
-      VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 993 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 993
-      // (none)
-
-      // Amplitude(s) for diagram number 993
-      FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 994 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 994
-      // (none)
-
-      // Amplitude(s) for diagram number 994
-      VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 995 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 995
-      // (none)
-
-      // Amplitude(s) for diagram number 995
-      FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-
-      // *** DIAGRAM 996 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 996
-      // (none)
-
-      // Amplitude(s) for diagram number 996
-      FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 997 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 997
-      // (none)
-
-      // Amplitude(s) for diagram number 997
-      FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-
-      // *** DIAGRAM 998 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 998
-      // (none)
-
-      // Amplitude(s) for diagram number 998
-      FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 999 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 999
-      // (none)
-
-      // Amplitude(s) for diagram number 999
-      VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1000 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1000
-      // (none)
-
-      // Amplitude(s) for diagram number 1000
-      FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1001 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1001
-      // (none)
-
-      // Amplitude(s) for diagram number 1001
-      FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1002 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1002
-      // (none)
-
-      // Amplitude(s) for diagram number 1002
-      FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1003 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1003
-      // (none)
-
-      // Amplitude(s) for diagram number 1003
-      FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1004 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1004
-      // (none)
-
-      // Amplitude(s) for diagram number 1004
-      FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1005 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1005
-      // (none)
-
-      // Amplitude(s) for diagram number 1005
-      FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 1006 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1006
-      // (none)
-
-      // Amplitude(s) for diagram number 1006
-      FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-
-      // *** DIAGRAM 1007 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1007
-      // (none)
-
-      // Amplitude(s) for diagram number 1007
-      VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1008 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1008
-      // (none)
-
-      // Amplitude(s) for diagram number 1008
-      VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1009 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1009
-      // (none)
-
-      // Amplitude(s) for diagram number 1009
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1010 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1010
-      // (none)
-
-      // Amplitude(s) for diagram number 1010
-      VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1011 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1011
-      // (none)
-
-      // Amplitude(s) for diagram number 1011
-      VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1012 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1012
-      // (none)
-
-      // Amplitude(s) for diagram number 1012
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 1013 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1013
-      // (none)
-
-      // Amplitude(s) for diagram number 1013
-      VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1014 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1014
-      // (none)
-
-      // Amplitude(s) for diagram number 1014
-      VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[62] += amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[97] += amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1015 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1015
-      VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] );
-      VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] );
-      VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] );
-
-      // Amplitude(s) for diagram number 1015
-      VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-      VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-      VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1016 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1016
-      VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] );
-      VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] );
-      VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-      // Amplitude(s) for diagram number 1016
-      VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[62] += amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[97] += amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1017 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1017
-      // (none)
-
-      // Amplitude(s) for diagram number 1017
-      VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1018 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1018
-      // (none)
-
-      // Amplitude(s) for diagram number 1018
-      VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1019 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1019
-      // (none)
-
-      // Amplitude(s) for diagram number 1019
-      VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 1020 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1020
-      // (none)
-
-      // Amplitude(s) for diagram number 1020
-      VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[3] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 1021 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1021
-      // (none)
-
-      // Amplitude(s) for diagram number 1021
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[3] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[3] -= amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-
-      // *** DIAGRAM 1022 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1022
-      // (none)
-
-      // Amplitude(s) for diagram number 1022
-      VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[25] -= amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[94] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 1023 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1023
-      // (none)
-
-      // Amplitude(s)
for diagram number 1023 - VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1024 OF 1240 *** - - // Wavefunction(s) for diagram number 1024 - // (none) - - // Amplitude(s) for diagram number 1024 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1025 OF 1240 *** - - // Wavefunction(s) for diagram number 1025 - // (none) - - // Amplitude(s) for diagram number 1025 - VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] 
-= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 1026 OF 1240 *** - - // Wavefunction(s) for diagram number 1026 - // (none) - - // Amplitude(s) for diagram number 1026 - VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1027 OF 1240 *** - - // Wavefunction(s) for diagram number 1027 - // (none) - - // Amplitude(s) for diagram number 1027 - VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1028 OF 1240 *** - - // Wavefunction(s) for diagram number 1028 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1028 - VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1029 OF 1240 *** - - // Wavefunction(s) for diagram number 1029 - // (none) - - // Amplitude(s) for diagram number 1029 - VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1030 OF 1240 *** - - // Wavefunction(s) for diagram number 1030 - // (none) - - // Amplitude(s) for diagram number 1030 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1031 OF 1240 *** - - // Wavefunction(s) for diagram number 1031 - // (none) - - // Amplitude(s) for diagram number 1031 - VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1032 OF 1240 *** - - // Wavefunction(s) for diagram number 1032 - // (none) - - // Amplitude(s) for diagram 
number 1032 - VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1033 OF 1240 *** - - // Wavefunction(s) for diagram number 1033 - // (none) - - // Amplitude(s) for diagram number 1033 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1034 OF 1240 *** - - // Wavefunction(s) for diagram number 1034 - // (none) - - // Amplitude(s) for diagram number 1034 - VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += 
amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1035 OF 1240 *** - - // Wavefunction(s) for diagram number 1035 - // (none) - - // Amplitude(s) for diagram number 1035 - VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1036 OF 1240 *** - - // Wavefunction(s) for diagram number 1036 - // (none) - - // Amplitude(s) for diagram number 1036 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1037 OF 1240 *** - - // Wavefunction(s) for diagram number 1037 - // (none) - - // Amplitude(s) for diagram number 1037 - VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - 
jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1038 OF 1240 *** - - // Wavefunction(s) for diagram number 1038 - // (none) - - // Amplitude(s) for diagram number 1038 - VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1039 OF 1240 *** - - // Wavefunction(s) for diagram number 1039 - // (none) - - // Amplitude(s) for diagram number 1039 - VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1040 OF 1240 *** - - // Wavefunction(s) for diagram number 1040 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], 
COUPs[2], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 1040 - VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1041 OF 1240 *** - - // Wavefunction(s) for diagram number 1041 - // (none) - - // Amplitude(s) for diagram number 1041 - VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; 
- VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1042 OF 1240 *** - - // Wavefunction(s) for diagram number 1042 - // (none) - - // Amplitude(s) for diagram number 1042 - VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1043 OF 1240 *** - - // Wavefunction(s) for diagram number 1043 - // (none) - - // Amplitude(s) for diagram number 1043 - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= 
amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1044 OF 1240 *** - - // Wavefunction(s) for diagram number 1044 - // (none) - - // Amplitude(s) for diagram number 1044 - VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1045 OF 1240 *** - - // Wavefunction(s) for diagram number 1045 - // (none) - - // Amplitude(s) for diagram number 1045 - VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1046 OF 1240 *** - - // Wavefunction(s) for diagram number 1046 - // (none) - - // Amplitude(s) for diagram number 1046 - FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= amp_sv[0]; - - // *** DIAGRAM 1047 OF 1240 *** - - // Wavefunction(s) for diagram number 1047 - // (none) - - // Amplitude(s) for diagram number 1047 - FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= amp_sv[0]; - - // *** DIAGRAM 1048 OF 1240 *** - - // Wavefunction(s) for diagram number 1048 - // (none) - - // 
Amplitude(s) for diagram number 1048 - FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 1049 OF 1240 *** - - // Wavefunction(s) for diagram number 1049 - // (none) - - // Amplitude(s) for diagram number 1049 - FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1050 OF 1240 *** - - // Wavefunction(s) for diagram number 1050 - // (none) - - // Amplitude(s) for diagram number 1050 - FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] -= amp_sv[0]; - - // *** DIAGRAM 1051 OF 1240 *** - - // Wavefunction(s) for diagram number 1051 - // (none) - - // Amplitude(s) for diagram number 1051 - FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 1052 OF 1240 *** - - // Wavefunction(s) for diagram number 1052 - // (none) - - // Amplitude(s) for diagram number 1052 - FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= amp_sv[0]; - - // *** DIAGRAM 1053 OF 1240 *** - - // Wavefunction(s) for diagram number 1053 - // (none) - - // Amplitude(s) for diagram number 1053 - FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 1054 OF 1240 *** - - // Wavefunction(s) for diagram number 1054 - // (none) - - // Amplitude(s) for diagram number 1054 - FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] -= amp_sv[0]; - - // *** DIAGRAM 1055 OF 1240 *** - - // Wavefunction(s) for diagram number 1055 - // (none) - - // Amplitude(s) for diagram number 1055 - FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1056 OF 1240 *** - - // Wavefunction(s) for diagram number 1056 - // (none) - - // Amplitude(s) for diagram number 1056 - FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] -= amp_sv[0]; - - // *** DIAGRAM 1057 OF 1240 *** - - // Wavefunction(s) for diagram number 1057 - // (none) - - // Amplitude(s) for diagram number 1057 - FFV1_0( w_fp[60], 
w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 1058 OF 1240 *** - - // Wavefunction(s) for diagram number 1058 - // (none) - - // Amplitude(s) for diagram number 1058 - FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 1059 OF 1240 *** - - // Wavefunction(s) for diagram number 1059 - // (none) - - // Amplitude(s) for diagram number 1059 - FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1060 OF 1240 *** - - // Wavefunction(s) for diagram number 1060 - // (none) - - // Amplitude(s) for diagram number 1060 - FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 1061 OF 1240 *** - - // Wavefunction(s) for diagram number 1061 - // (none) - - // Amplitude(s) for diagram number 1061 - VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1062 OF 1240 *** - - // Wavefunction(s) for diagram number 1062 - // (none) - - // Amplitude(s) for diagram number 1062 - FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1063 OF 1240 *** - - // Wavefunction(s) for diagram number 1063 - // (none) - - // Amplitude(s) for diagram number 1063 - VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1064 OF 1240 *** - - // 
Wavefunction(s) for diagram number 1064
-    // (none)
-
-    // Amplitude(s) for diagram number 1064
-    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1065 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1065
-    // (none)
-
-    // Amplitude(s) for diagram number 1065
-    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[76] -= amp_sv[0];
-
-    // *** DIAGRAM 1066 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1066
-    // (none)
-
-    // Amplitude(s) for diagram number 1066
-    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] -= amp_sv[0];
-
-    // *** DIAGRAM 1067 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1067
-    // (none)
-
-    // Amplitude(s) for diagram number 1067
-    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[82] -= amp_sv[0];
-
-    // *** DIAGRAM 1068 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1068
-    // (none)
-
-    // Amplitude(s) for diagram number 1068
-    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[92] -= amp_sv[0];
-
-    // *** DIAGRAM 1069 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1069
-    // (none)
-
-    // Amplitude(s) for diagram number 1069
-    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[79] -= amp_sv[0];
-
-    // *** DIAGRAM 1070 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1070
-    // (none)
-
-    // Amplitude(s) for diagram number 1070
-    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= amp_sv[0];
-
-    // *** DIAGRAM 1071 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1071
-    // (none)
-
-    // Amplitude(s) for diagram number 1071
-    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] -= amp_sv[0];
-
-    // *** DIAGRAM 1072 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1072
-    // (none)
-
-    // Amplitude(s) for diagram number 1072
-    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= amp_sv[0];
-
-    // *** DIAGRAM 1073 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1073
-    // (none)
-
-    // Amplitude(s) for diagram number 1073
-    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[80] -= amp_sv[0];
-
-    // *** DIAGRAM 1074 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1074
-    // (none)
-
-    // Amplitude(s) for diagram number 1074
-    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[86] -= amp_sv[0];
-
-    // *** DIAGRAM 1075 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1075
-    // (none)
-
-    // Amplitude(s) for diagram number 1075
-    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] -= amp_sv[0];
-
-    // *** DIAGRAM 1076 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1076
-    // (none)
-
-    // Amplitude(s) for diagram number 1076
-    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] -= amp_sv[0];
-
-    // *** DIAGRAM 1077 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1077
-    // (none)
-
-    // Amplitude(s) for diagram number 1077
-    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[77] += amp_sv[0];
-
-    // *** DIAGRAM 1078 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1078
-    // (none)
-
-    // Amplitude(s) for diagram number 1078
-    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1079 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1079
-    // (none)
-
-    // Amplitude(s) for diagram number 1079
-    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-
-    // *** DIAGRAM 1080 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1080
-    // (none)
-
-    // Amplitude(s) for diagram number 1080
-    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1081 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1081
-    // (none)
-
-    // Amplitude(s) for diagram number 1081
-    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1082 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1082
-    // (none)
-
-    // Amplitude(s) for diagram number 1082
-    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1083 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1083
-    // (none)
-
-    // Amplitude(s) for diagram number 1083
-    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1084 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1084
-    // (none)
-
-    // Amplitude(s) for diagram number 1084
-    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[100] -= amp_sv[0];
-
-    // *** DIAGRAM 1085 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1085
-    // (none)
-
-    // Amplitude(s) for diagram number 1085
-    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] -= amp_sv[0];
-
-    // *** DIAGRAM 1086 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1086
-    // (none)
-
-    // Amplitude(s) for diagram number 1086
-    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[106] -= amp_sv[0];
-
-    // *** DIAGRAM 1087 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1087
-    // (none)
-
-    // Amplitude(s) for diagram number 1087
-    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 1088 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1088
-    // (none)
-
-    // Amplitude(s) for diagram number 1088
-    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 1089 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1089
-    // (none)
-
-    // Amplitude(s) for diagram number 1089
-    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] -= amp_sv[0];
-
-    // *** DIAGRAM 1090 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1090
-    // (none)
-
-    // Amplitude(s) for diagram number 1090
-    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] -= amp_sv[0];
-
-    // *** DIAGRAM 1091 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1091
-    // (none)
-
-    // Amplitude(s) for diagram number 1091
-    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= amp_sv[0];
-
-    // *** DIAGRAM 1092 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1092
-    // (none)
-
-    // Amplitude(s) for diagram number 1092
-    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[104] -= amp_sv[0];
-
-    // *** DIAGRAM 1093 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1093
-    // (none)
-
-    // Amplitude(s) for diagram number 1093
-    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 1094 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1094
-    // (none)
-
-    // Amplitude(s) for diagram number 1094
-    FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 1095 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1095
-    // (none)
-
-    // Amplitude(s) for diagram number 1095
-    FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] -= amp_sv[0];
-
-    // *** DIAGRAM 1096 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1096
-    // (none)
-
-    // Amplitude(s) for diagram number 1096
-    FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[101] += amp_sv[0];
-
-    // *** DIAGRAM 1097 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1097
-    // (none)
-
-    // Amplitude(s) for diagram number 1097
-    FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1098 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1098
-    // (none)
-
-    // Amplitude(s) for diagram number 1098
-    FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[105] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 1099 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1099
-    // (none)
-
-    // Amplitude(s) for diagram number 1099
-    VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1100 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1100
-    // (none)
-
-    // Amplitude(s) for diagram number 1100
-    FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1101 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1101
-    // (none)
-
-    // Amplitude(s) for diagram number 1101
-    VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1102 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1102
-    // (none)
-
-    // Amplitude(s) for diagram number 1102
-    FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1103 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1103
-    // (none)
-
-    // Amplitude(s) for diagram number 1103
-    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 1104 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1104
-    // (none)
-
-    // Amplitude(s) for diagram number 1104
-    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1105 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1105
-    // (none)
-
-    // Amplitude(s) for diagram number 1105
-    FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-
-    // *** DIAGRAM 1106 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1106
-    // (none)
-
-    // Amplitude(s) for diagram number 1106
-    VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1107 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1107
-    // (none)
-
-    // Amplitude(s) for diagram number 1107
-    FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1108 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1108
-    // (none)
-
-    // Amplitude(s) for diagram number 1108
-    VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1109 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1109
-    // (none)
-
-    // Amplitude(s) for diagram number 1109
-    FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1110 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1110
-    // (none)
-
-    // Amplitude(s) for diagram number 1110
-    FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] += amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1111 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1111
-    // (none)
-
-    // Amplitude(s) for diagram number 1111
-    FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1112 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1112
-    // (none)
-
-    // Amplitude(s) for diagram number 1112
-    FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-
-    // *** DIAGRAM 1113 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1113
-    // (none)
-
-    // Amplitude(s) for diagram number 1113
-    VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1114 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1114
-    // (none)
-
-    // Amplitude(s) for diagram number 1114
-    FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1115 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1115
-    // (none)
-
-    // Amplitude(s) for diagram number 1115
-    VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1116 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1116
-    // (none)
-
-    // Amplitude(s) for diagram number 1116
-    FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1117 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1117
-    // (none)
-
-    // Amplitude(s) for diagram number 1117
-    FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 1118 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1118
-    // (none)
-
-    // Amplitude(s) for diagram number 1118
-    FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1119 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1119
-    // (none)
-
-    // Amplitude(s) for diagram number 1119
-    FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-
-    // *** DIAGRAM 1120 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1120
-    // (none)
-
-    // Amplitude(s) for diagram number 1120
-    VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1121 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1121
-    // (none)
-
-    // Amplitude(s) for diagram number 1121
-    FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1122 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1122
-    // (none)
-
-    // Amplitude(s) for diagram number 1122
-    VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1123 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1123
-    // (none)
-
-    // Amplitude(s) for diagram number 1123
-    FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1124 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1124
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
-
-    // Amplitude(s) for diagram number 1124
-    VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1125 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1125
-    VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
-    VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
-    VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
-
-    // Amplitude(s) for diagram number 1125
-    VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1126 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1126
-    VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
-    VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
-    VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 1126
-    VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1127 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1127
-    // (none)
-
-    // Amplitude(s) for diagram number 1127
-    VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1128 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1128
-    FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-    FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-    FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
-
-    // Amplitude(s) for diagram number 1128
-    FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-
-    // *** DIAGRAM 1129 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1129
-    // (none)
-
-    // Amplitude(s) for diagram number 1129
-    FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1130 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1130
-    // (none)
-
-    // Amplitude(s) for diagram number 1130
-    FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-
-    // *** DIAGRAM 1131 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1131
-    // (none)
-
-    // Amplitude(s) for diagram number 1131
-    FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1132 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1132
-    // (none)
-
-    // Amplitude(s) for diagram number 1132
-    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1133 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1133
-    // (none)
-
-    // Amplitude(s) for diagram number 1133
-    FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 1134 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1134
-    FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-    FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-    FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
-    // Amplitude(s) for diagram number 1134
-    FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-
-    // *** DIAGRAM 1135 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1135
-    // (none)
-
-    // Amplitude(s) for diagram number 1135
-    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1136 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1136
-    // (none)
-
-    // Amplitude(s) for diagram number 1136
-    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-
-    // *** DIAGRAM 1137 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1137
-    // (none)
-
-    // Amplitude(s) for diagram number 1137
-    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1138 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1138
-    // (none)
-
-    // Amplitude(s) for diagram number 1138
-    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1139 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1139
-    // (none)
-
-    // Amplitude(s) for diagram number 1139
-    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1140 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1140
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
-
-    // Amplitude(s) for diagram number 1140
-    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] +=
amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1141 OF 1240 *** - - // Wavefunction(s) for diagram number 1141 - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); - VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 1141 - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1142 OF 1240 *** - - // Wavefunction(s) for diagram number 1142 - VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 1142 - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1143 OF 1240 *** - - // Wavefunction(s) for diagram number 1143 - // (none) - - // Amplitude(s) for diagram number 1143 - VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1144 OF 1240 *** - - // Wavefunction(s) for diagram number 1144 - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] ); - FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 1144 - FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - - // *** DIAGRAM 1145 OF 1240 *** - - // Wavefunction(s) for diagram number 1145 - // (none) - - // Amplitude(s) for diagram number 1145 - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1146 OF 1240 *** - - // Wavefunction(s) for diagram number 1146 - // (none) - - // Amplitude(s) for diagram number 1146 - FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1147 OF 1240 *** - - // Wavefunction(s) for diagram number 1147 - // (none) - - // Amplitude(s) for diagram number 1147 - FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1148 OF 1240 *** - - // Wavefunction(s) for diagram number 1148 - // (none) - - // Amplitude(s) for diagram number 1148 - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1149 OF 1240 *** - - // Wavefunction(s) for diagram number 1149 - // (none) - - // Amplitude(s) for diagram number 1149 - FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1150 OF 1240 *** - - // Wavefunction(s) for diagram number 1150 - FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); - - // Amplitude(s) for diagram number 1150 - FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - - // *** DIAGRAM 1151 OF 1240 *** - - // Wavefunction(s) for diagram number 1151 - // (none) - - // Amplitude(s) for diagram number 1151 - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1152 OF 1240 *** - - // Wavefunction(s) for diagram number 1152 - // (none) - - // Amplitude(s) for diagram number 1152 - FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - - // *** DIAGRAM 1153 OF 1240 *** - - // Wavefunction(s) for diagram number 1153 - // (none) - - // Amplitude(s) for diagram number 1153 - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] 
); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1154 OF 1240 *** - - // Wavefunction(s) for diagram number 1154 - // (none) - - // Amplitude(s) for diagram number 1154 - FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1155 OF 1240 *** - - // Wavefunction(s) for diagram number 1155 - // (none) - - // Amplitude(s) for diagram number 1155 - FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1156 OF 1240 *** - - // Wavefunction(s) for diagram number 1156 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1156 - VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[68] -= 
amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVVV4_0( 
w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1157 OF 1240 *** - - // Wavefunction(s) for diagram number 1157 - VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); - VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 1157 - VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 1158 OF 1240 *** - - // Wavefunction(s) for diagram number 1158 - VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 1158 - VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 
amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1159 OF 1240 *** - - // Wavefunction(s) for diagram number 1159 - // (none) - - // Amplitude(s) for diagram number 1159 - VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - 
jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 1160 OF 1240 *** - - // Wavefunction(s) for diagram number 1160 - FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - - // Amplitude(s) for diagram number 1160 - FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - - // *** DIAGRAM 1161 OF 1240 *** - - // Wavefunction(s) for diagram number 1161 - // (none) - - // Amplitude(s) for diagram number 1161 - FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1162 OF 1240 *** - - // Wavefunction(s) for diagram number 1162 - // (none) - - // Amplitude(s) for diagram number 1162 - FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1163 OF 1240 *** - - // Wavefunction(s) for diagram number 1163 - // (none) - - // Amplitude(s) for diagram number 1163 - FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - - // *** DIAGRAM 1164 OF 1240 *** - - // Wavefunction(s) for diagram number 1164 - // (none) - - // Amplitude(s) for diagram number 1164 - FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 
1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1165 OF 1240 *** - - // Wavefunction(s) for diagram number 1165 - // (none) - - // Amplitude(s) for diagram number 1165 - FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - - // *** DIAGRAM 1166 OF 1240 *** - - // Wavefunction(s) for diagram number 1166 - FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - - // Amplitude(s) for diagram number 1166 - FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1167 OF 1240 *** - - // Wavefunction(s) for diagram number 1167 - // (none) - - // Amplitude(s) for diagram number 1167 - FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 
0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1168
- // (none)
-
- // Amplitude(s) for diagram number 1168
- FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1169
- // (none)
-
- // Amplitude(s) for diagram number 1169
- FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1170
- // (none)
-
- // Amplitude(s) for diagram number 1170
- FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1171
- // (none)
-
- // Amplitude(s) for diagram number 1171
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1172 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1172
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
- // Amplitude(s) for diagram number 1172
- FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
-
- // *** DIAGRAM 1173 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1173
- VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
- VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
-
- // Amplitude(s) for diagram number 1173
- FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1174 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1174
- // (none)
-
- // Amplitude(s) for diagram number 1174
- FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
-
- // *** DIAGRAM 1175 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1175
- FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
- FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 1175
- FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
-
- // *** DIAGRAM 1176 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1176
- // (none)
-
- // Amplitude(s) for diagram number 1176
- FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1177 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1177
- // (none)
-
- // Amplitude(s) for diagram number 1177
- FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1178 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1178
- // (none)
-
- // Amplitude(s) for diagram number 1178
- FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1179 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1179
- // (none)
-
- // Amplitude(s) for diagram number 1179
- FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1180 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1180
- // (none)
-
- // Amplitude(s) for diagram number 1180
- VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 1181 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1181
- // (none)
-
- // Amplitude(s) for diagram number 1181
- VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1182 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1182
- VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] );
- VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] );
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 1182
- VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 1183 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1183
- // (none)
-
- // Amplitude(s) for diagram number 1183
- VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1184 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1184
- // (none)
-
- // Amplitude(s) for diagram number 1184
- FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1185 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1185
- // (none)
-
- // Amplitude(s) for diagram number 1185
- FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 1186 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1186
- // (none)
-
- // Amplitude(s) for diagram number 1186
- FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1187 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1187
- // (none)
-
- // Amplitude(s) for diagram number 1187
- FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 1188 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1188
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] );
- FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
- FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
- FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
-
- // Amplitude(s) for diagram number 1188
- FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
-
- // *** DIAGRAM 1189 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1189
- VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] );
- VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] );
- VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
-
- // Amplitude(s) for diagram number 1189
- FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1190 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1190
- // (none)
-
- // Amplitude(s) for diagram number 1190
- FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
-
- // *** DIAGRAM 1191 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1191
- FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] );
- FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
- FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 1191
- FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
-
- // *** DIAGRAM 1192 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1192
- // (none)
-
- // Amplitude(s) for diagram number 1192
- FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1193 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1193
- // (none)
-
- // Amplitude(s) for diagram number 1193
- FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 1194 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1194
- // (none)
-
- // Amplitude(s) for diagram number 1194
- FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1195 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1195
- // (none)
-
- // Amplitude(s) for diagram number 1195
- FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1196 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1196
- // (none)
-
- // Amplitude(s) for diagram number 1196
- VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
-
- // *** DIAGRAM 1197 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1197
- // (none)
-
- // Amplitude(s) for diagram number 1197
- VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 1198 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1198
- VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] );
- VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
- VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
- // Amplitude(s) for diagram number 1198
- VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 1199 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1199
- // (none)
-
- // Amplitude(s) for diagram number 1199
- VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 1200 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1200
- // (none)
-
- // Amplitude(s) for diagram number 1200
- FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype(
0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1201 OF 1240 *** - - // Wavefunction(s) for diagram number 1201 - // (none) - - // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - - // *** DIAGRAM 1202 OF 1240 *** - - // Wavefunction(s) for diagram number 1202 - // (none) - - // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1203 OF 1240 *** - - // Wavefunction(s) for diagram number 1203 - // (none) - - // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1204 OF 1240 *** - - // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 1205 OF 1240 *** - - // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1206 OF 1240 *** - - // Wavefunction(s) for diagram number 1206 - // (none) - - // Amplitude(s) for diagram number 1206 - FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[29] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - - // *** DIAGRAM 1207 OF 1240 *** - - // Wavefunction(s) for diagram number 1207 - FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); - FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - - // Amplitude(s) for diagram number 1207 - FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 1208 OF 1240 *** - - // Wavefunction(s) for diagram number 1208 - // (none) - 
- // Amplitude(s) for diagram number 1208 - FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1209 OF 1240 *** - - // Wavefunction(s) for diagram number 1209 - // (none) - - // Amplitude(s) for diagram number 1209 - FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - - // *** DIAGRAM 1210 OF 1240 *** - - // Wavefunction(s) for diagram number 1210 - // (none) - - // Amplitude(s) for diagram number 1210 - FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1211 OF 1240 *** - - // Wavefunction(s) for diagram number 1211 - // (none) - - // Amplitude(s) for diagram number 1211 - FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1212 OF 1240 *** - - // Wavefunction(s) for diagram number 1212 - // (none) - - // Amplitude(s) for diagram number 1212 - VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 1213 OF 1240 *** - - // Wavefunction(s) for diagram number 1213 - // (none) - - // Amplitude(s) for diagram number 1213 - VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - 
jamp_sv[29] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += 
amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1214 OF 1240 *** - - // Wavefunction(s) for diagram number 1214 - VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); - VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 1214 - VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1215 OF 1240 *** - - // Wavefunction(s) for diagram number 1215 - // (none) - - // Amplitude(s) for diagram number 1215 - VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1216 OF 1240 *** - - // Wavefunction(s) for diagram number 1216 - // (none) - - // Amplitude(s) for diagram number 1216 - FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1217 OF 1240 *** - - // Wavefunction(s) for diagram number 1217 - // (none) - - // Amplitude(s) for diagram number 1217 - FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - - // *** DIAGRAM 1218 OF 1240 *** - - // Wavefunction(s) for diagram number 1218 - // (none) - - // Amplitude(s) for diagram number 1218 - FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1219 OF 1240 *** - - // Wavefunction(s) for diagram number 1219 - // (none) - - // Amplitude(s) for diagram number 1219 - FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - 
jamp_sv[22] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1220 OF 1240 *** - - // Wavefunction(s) for diagram number 1220 - // (none) - - // Amplitude(s) for diagram number 1220 - VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - 
jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 1221 OF 1240 *** - - // Wavefunction(s) for diagram number 1221 - VVV1P0_1( w_fp[0], 
w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); - VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 1221 - VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1222 OF 1240 *** - - // Wavefunction(s) for diagram number 1222 - // (none) - - // Amplitude(s) for diagram number 1222 - VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += 
amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1223 OF 1240 *** - - // Wavefunction(s) for diagram number 1223 - // (none) - - // Amplitude(s) for diagram number 1223 - FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1224 OF 1240 *** - - // Wavefunction(s) for diagram number 1224 - // (none) - - // Amplitude(s) for diagram number 1224 - FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[3], 
w_fp[113], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 1225 OF 1240 *** - - // Wavefunction(s) for diagram number 1225 - // (none) - - // Amplitude(s) for diagram number 1225 - FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1226 OF 1240 *** - - // Wavefunction(s) for diagram number 1226 - // (none) - - // Amplitude(s) for diagram number 1226 - FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - - // *** DIAGRAM 1227 OF 1240 *** - - // Wavefunction(s) for diagram number 1227 - // (none) - - // Amplitude(s) for diagram number 1227 - VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += 
amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 1228 OF 1240 *** - - // Wavefunction(s) for diagram number 1228 - VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); - VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); - - // Amplitude(s) for diagram number 1228 - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1229 OF 1240 *** - - // Wavefunction(s) for diagram number 1229 - // (none) - - // Amplitude(s) for diagram number 1229 - VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 1230 OF 1240 *** - - // Wavefunction(s) for diagram number 1230 - // (none) - - // Amplitude(s) for diagram number 1230 - FFV1_0( 
w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1231 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1231
- // (none)
-
- // Amplitude(s) for diagram number 1231
- FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
-
- // *** DIAGRAM 1232 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1232
- // (none)
-
- // Amplitude(s) for diagram number 1232
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110]
+= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1233 OF 1240 *** - - // Wavefunction(s) for diagram number 1233 - // (none) - - // Amplitude(s) for diagram number 1233 - FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1234 OF 1240 *** - - // Wavefunction(s) for diagram number 1234 - // (none) - - // Amplitude(s) for diagram number 1234 - VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] 
-= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= 
amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 1235 OF 1240 *** - - // Wavefunction(s) for diagram number 1235 - VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); - VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); - VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); - - // Amplitude(s) for diagram number 1235 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; 
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1236 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1236
- // (none)
-
- // Amplitude(s) for diagram number 1236
- VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1237 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1237
- // (none)
-
- // Amplitude(s) for diagram number 1237
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1238 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1238
- // (none)
-
- // Amplitude(s) for diagram number 1238
- FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
-
- // *** DIAGRAM 1239 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1239
- // (none)
-
- // Amplitude(s) for diagram number 1239
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1240 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1240
- // (none)
-
- // Amplitude(s) for diagram number 1240
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
- for( int icol = 0; icol < ncolor; icol++ )
- jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?)
-
- // The color denominators (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120]
-
- // The color matrix (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 },
- { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496,
-62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 
496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 
64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, 
-62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 
1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, 
-116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, 
-8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 
505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 
19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, 
-71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 
514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 
505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, 
-53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 
442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, 
-71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 
64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, 
-80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, 
-80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, 
-62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; 
jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv jampR_sv[ncolor] = { 0 };
- fptype2_sv jampI_sv[ncolor] = { 0 };
- for( int icol = 0; icol < ncolor; icol++ )
- {
- jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
- jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
- }
-#endif
- for( int icol = 0; icol < ncolor; icol++ )
- {
- //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
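For reference, the constexpr colour-matrix normalization deleted above can be illustrated standalone: diagonal entries are pre-divided by denom, off-diagonal entries are pre-multiplied by 2 (using the symmetry of cf), so a runtime loop over the upper triangle alone evaluates A.(cf/denom).A. A minimal C++14 sketch, assuming a hypothetical 2-colour matrix with uniform denominators (toy values, not the generated cf/denom above):

#include <cstdio>
constexpr int ncolor = 2;
constexpr double denom[ncolor] = { 3., 3. };
constexpr double cf[ncolor][ncolor] = { { 16., -2. }, { -2., 16. } };
struct TriangularNormalized
{
  constexpr TriangularNormalized()
    : value()
  {
    for( int i = 0; i < ncolor; i++ )
    {
      value[i][i] = cf[i][i] / denom[i]; // diagonal term: cf/denom
      for( int j = i + 1; j < ncolor; j++ )
        value[i][j] = 2 * cf[i][j] / denom[i]; // upper triangle: fold in the factor 2 at compile time
    }
  }
  double value[ncolor][ncolor];
};
static constexpr auto cf2 = TriangularNormalized();
int main()
{
  const double A[ncolor] = { 1., 0.5 }; // e.g. the real parts of the jamps (imaginary parts are summed identically)
  double ama = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = cf2.value[i][i] * A[i];
    for( int j = i + 1; j < ncolor; j++ ) ztemp += cf2.value[i][j] * A[j];
    ama += A[i] * ztemp; // accumulates the 'AMA' term of AMA + BMB
  }
  printf( "AMA = %f\n", ama ); // prints 6.0, i.e. A.(cf/denom).A
  return 0;
}

+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e.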
zero allJamps: this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
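When MGONGPU_SUPPORTS_MULTICHANNEL is not defined, the same diagramXXX interface is thus kept, but the three multichannel pointers are passed as nullptr and only sanity-checked. A minimal sketch of a kernel body honouring that convention (hypothetical, simplified names and signature, not the code generated by this patch):

#include <cassert>
typedef double fptype; // hypothetical stand-in for the plugin's floating-point type
// Simplified stand-in for one generated diagram kernel: with multichannel disabled,
// channelIds/numerators/denominators arrive as nullptr and are never dereferenced.
inline void diagramN( fptype* jamps, const unsigned int* channelIds, fptype* numerators, fptype* denominators )
{
  if( channelIds == nullptr )
    assert( numerators == nullptr && denominators == nullptr ); // uniform-interface sanity check
  jamps[0] += 1.; // placeholder for this diagram's colour-flow contribution
}
int main()
{
  fptype jamps[1] = { 0. };
  diagramN( jamps, nullptr, nullptr, nullptr ); // multichannel disabled: all three pointers are nullptr
  return 0;
}

- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n",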
ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators = nullptr;
+ fptype* denominators = nullptr;
#endif
+
+ // ------------------------
+ // --- FEYNMAN DIAGRAMS ---
+ // ------------------------
+
+ // *** DIAGRAMS 1 TO 1240 ***
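Note the asymmetry in the launches below: only diagram1 also receives momenta and ihel, i.e. it additionally computes the external wavefunctions into wfs for the requested helicity, which diagram2 onwards then reuse. Schematically, in plain C++ with hypothetical helper names (an outline of the call sequence, not the generated code):

#include <cstdio>
// Hypothetical stubs, only to outline the sequence of the diagram kernels:
static void computeExternalWavefunctions( double* wfs, const double* momenta, int ihel )
{
  wfs[0] = momenta[0] + ihel; // pretend wavefunction derived from momenta and helicity
}
static void addDiagramContribution( double* jamps, const double* wfs )
{
  jamps[0] += wfs[0]; // pretend colour-flow contribution reusing the cached wavefunctions
}
int main()
{
  double wfs[1], jamps[1] = { 0. };
  const double momenta[1] = { 0.5 };
  const int ihel = 0;
  computeExternalWavefunctions( wfs, momenta, ihel ); // folded into diagram1, hence its extra arguments
  for( int d = 2; d <= 1240; d++ )
    addDiagramContribution( jamps, wfs ); // diagram2..diagram1240 take no momenta/ihel
  printf( "jamps[0] = %f\n", jamps[0] );
  return 0;
}

+#ifdef MGONGPUCPP_GPUIMPL
+ gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+ gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,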
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram79, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram241, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram242, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram243, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram244, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram245, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram246, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram247, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram248, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram249, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram250, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram251, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram252, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram253, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram254, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram255, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram256, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram257, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram258, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram259, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram260, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram261, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram262, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram263, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram264, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram265, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram266, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram267, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram268, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram269, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram270, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram271, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram272, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram273, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram274, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram275, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram276, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram277, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram278, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram279, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram280, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram281, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram282, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram283, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram284, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram285, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram286, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram287, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram288, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram289, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram290, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram291, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram292, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram293, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram294, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram295, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram296, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram297, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram298, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram299, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram300, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram301, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram302, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram303, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram304, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram305, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram306, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram307, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram308, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram309, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram310, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram311, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram312, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram313, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram314, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram315, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram316, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram317, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram318, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram319, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram320, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram321, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram322, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram323, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram324, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram325, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram326, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram327, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram328, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram329, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram330, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram331, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram332, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram333, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram334, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram335, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram336, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram337, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram338, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram339, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram340, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram341, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram342, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram343, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram344, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram345, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram346, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram347, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram348, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram349, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram350, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram351, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram352, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram353, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram354, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram355, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram356, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram357, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram358, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram359, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram360, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram361, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram362, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram363, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram364, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram365, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram366, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram367, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram368, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram369, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram370, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram371, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram372, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram373, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram374, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram375, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram376, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram377, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram378, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram379, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram380, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram381, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram382, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram383, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram384, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram385, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram386, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram387, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram388, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram389, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram390, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram391, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram392, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram393, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram394, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram395, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram396, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram397, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram398, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram399, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram400, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram401, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram402, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram403, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram404, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram405, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram406, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram407, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram408, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram409, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram410, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram411, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram412, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram413, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram414, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram415, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram416, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram417, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram418, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram419, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram420, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram421, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram422, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram423, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram424, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram425, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram426, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram427, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram428, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram429, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram430, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram431, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram432, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram433, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram434, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram435, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram436, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram437, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram438, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram439, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram440, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram441, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram442, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram443, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram444, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram445, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram446, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram447, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram448, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram449, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram450, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram451, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram452, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram453, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram454, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram455, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram456, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram457, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram458, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram459, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram460, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram461, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram462, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram463, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram464, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram465, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram466, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram467, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram468, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram469, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram470, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram471, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram472, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram473, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram474, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram475, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram476, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram477, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram478, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram479, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram480, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram481, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram482, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram483, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram484, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram485, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram486, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram487, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram488, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram489, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram490, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram491, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram492, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram493, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram494, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram495, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram496, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram497, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram498, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram499, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram500, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram501, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram502, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram503, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram504, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram505, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram506, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram507, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram508, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram509, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram510, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram511, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram512, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram513, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram514, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram515, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram516, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram517, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram518, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram519, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram520, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram521, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram522, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram523, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram524, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram525, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram526, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram527, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram528, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram529, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram530, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram531, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram532, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram533, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram534, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram535, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram536, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram537, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram538, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram539, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram540, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram541, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram542, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram543, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram544, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram545, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram546, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram547, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram548, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram549, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram550, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram551, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram552, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram553, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram554, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram555, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram556, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram557, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram558, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram559, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram560, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram561, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram562, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram563, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram564, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram565, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram566, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram567, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram568, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram569, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram570, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram571, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram572, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram573, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram574, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram575, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram576, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram577, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram578, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram579, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram580, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram581, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram582, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram583, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram584, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram585, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram586, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram587, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram588, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram589, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram590, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram591, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram592, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram593, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram594, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram595, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram596, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram597, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram598, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram599, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram600, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram601, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram602, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram603, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram604, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram605, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram606, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram607, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram608, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram609, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram610, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram611, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram612, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram613, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram614, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram615, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram616, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram617, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram618, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram619, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram620, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram621, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram622, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram623, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram624, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram625, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram626, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram627, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram628, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram629, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram630, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram631, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram632, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram633, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram634, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram635, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram636, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram637, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram638, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram639, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram640, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram641, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram642, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram643, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram644, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram645, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram646, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram647, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram648, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram649, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram650, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram651, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram652, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram653, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram654, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram655, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram656, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram657, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram658, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram659, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram660, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram661, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram662, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram663, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram664, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram665, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram666, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram667, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram668, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram669, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram670, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram671, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram672, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram673, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram674, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram675, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram676, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram677, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram678, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram679, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram680, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram681, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram682, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram683, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram684, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram685, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram686, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram687, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram688, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram689, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram690, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram691, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram692, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram693, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram694, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram695, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram696, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram697, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram698, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram699, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram700, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram701, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram702, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram703, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram704, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram705, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram706, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram707, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram708, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram709, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram710, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram711, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram712, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram713, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram714, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram715, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram716, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram717, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram718, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram719, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram720, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram721, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram722, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram723, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram724, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram725, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram726, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram727, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram728, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram729, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram730, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram731, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram732, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram733, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram734, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram735, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram736, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram737, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram738, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram739, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram740, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram741, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram742, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram743, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram744, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram745, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram746, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram747, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram748, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram749, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram750, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram751, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram752, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram753, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram754, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram755, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram756, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram757, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram758, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram759, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram760, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram761, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram762, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram763, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram764, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram765, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram766, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram767, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram768, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram769, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram770, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram771, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram772, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram773, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram774, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram775, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram776, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram777, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram778, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram779, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram780, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram781, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram782, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram783, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram784, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram785, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram786, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram787, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram788, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram789, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram790, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram791, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram792, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram793, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram794, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram795, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram796, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram797, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram798, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram799, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram800, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram801, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram802, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram803, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram804, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram805, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram806, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram807, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram808, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram809, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram810, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram811, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram812, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram813, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram814, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram815, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram816, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram817, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram818, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram819, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram820, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram821, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram822, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram823, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram824, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram825, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram826, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram827, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram828, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram829, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram830, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram831, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram832, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram833, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram834, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram835, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram836, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram837, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram838, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram839, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram840, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram841, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram842, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram843, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram844, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram845, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram846, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram847, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram848, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram849, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram850, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram851, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram852, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram853, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram854, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram855, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram856, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram857, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram858, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram859, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram860, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram861, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram862, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram863, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram864, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram865, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram866, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram867, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram868, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram869, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram870, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram871, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram872, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram873, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram874, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram875, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram876, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram877, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram878, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram879, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram880, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram881, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram882, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram883, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram884, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram885, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram886, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram887, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram888, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram889, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram890, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram891, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram892, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram893, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram894, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram895, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram896, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram897, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram898, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram899, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram900, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram901, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram902, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram903, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram904, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram905, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram906, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram907, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram908, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram909, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram910, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram911, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram912, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram913, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram914, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram915, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram916, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram917, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram918, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram919, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram920, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram921, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram922, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram923, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram924, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram925, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram926, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram927, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram928, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram929, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram930, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram931, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram932, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram933, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram934, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram935, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram936, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram937, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram938, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram939, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram940, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram941, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram942, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram943, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram944, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram945, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram946, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram947, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram948, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram949, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram950, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram951, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram952, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram953, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram954, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram955, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram956, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram957, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram958, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram959, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram960, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram961, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram962, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram963, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram964, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram965, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram966, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram967, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram968, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram969, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram970, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram971, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram972, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram973, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram974, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram975, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram976, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram977, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram978, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram979, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram980, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram981, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram982, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram983, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram984, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram985, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram986, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram987, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram988, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram989, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram990, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram991, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram992, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram993, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram994, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram995, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram996, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram997, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram998, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram999, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1000, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1001, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1002, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1003, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1004, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1005, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1006, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1007, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1008, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1009, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1010, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1011, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1012, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1013, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1014, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1015, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1016, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1017, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1018, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1019, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1020, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1021, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1022, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1023, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1024, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1025, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1026, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1027, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1028, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1029, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1030, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1031, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1032, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1033, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1034, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1035, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1036, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1037, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1038, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1039, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1040, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1041, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1042, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1043, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1044, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1045, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1046, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1047, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1048, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1049, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1050, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1051, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1052, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1053, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1054, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1055, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1056, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1057, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1058, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1059, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1060, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1061, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1062, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1063, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1064, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1065, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1066, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1067, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1068, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1069, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1070, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1071, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1072, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1073, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1074, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1075, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1076, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1077, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1078, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1079, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1080, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1081, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1082, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1083, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1084, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1085, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1086, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1087, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1088, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1089, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1090, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1091, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1092, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1093, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1094, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1095, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1096, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1097, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1098, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1099, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+#else
+ diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+ diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram124( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram125( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram126( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram127( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram128( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram129( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram130( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram131( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram132( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram133( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram134( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram135( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram136( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram137( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram138( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram139( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram140( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram141( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram142( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram143( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram144( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram145( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram146( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram147( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram148( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram149( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram150( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram151( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram152( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram153( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram154( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram155( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram156( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram157( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram158( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram159( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram160( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram161( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram162( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram163( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram164( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram165( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram166( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram167( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram168( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram169( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram170( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram171( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram172( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram173( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram174( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram175( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram176( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram177( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram178( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram179( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram180( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram181( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram182( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram183( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram184( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram185( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram186( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram187( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram188( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram189( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram190( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram191( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram192( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram193( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram194( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram195( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram196( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram197( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram198( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram199( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram200( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram201( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram202( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram203( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram204( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram205( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram206( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram207( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram208( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram209( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram210( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram211( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram212( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram213( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram214( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram215( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram216( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram217( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram218( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram219( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram220( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram221( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram222( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram223( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram224( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram225( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram226( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram227( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram228( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram229( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram230( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram231( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram232( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram233( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram234( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram235( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram236( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram237( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram238( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram239( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram240( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram241( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram242( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram243( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram244( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram245( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram246( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram247( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram248( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram249( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram250( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram251( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram252( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram253( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram254( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram255( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram256( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram257( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram258( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram259( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram260( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram261( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram262( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram263( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram264( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram265( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram266( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram267( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram268( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram269( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram270( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram271( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram272( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram273( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram274( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram275( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram276( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram277( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram278( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram279( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram280( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram281( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram282( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram283( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram284( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram285( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram286( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram287( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram288( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram289( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram290( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram291( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram292( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram293( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram294( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram295( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram296( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram297( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram298( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram299( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram300( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram301( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram302( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram303( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram304( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram305( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram306( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram307( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram308( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram309( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram310( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram311( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram312( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram313( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram314( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram315( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram316( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram317( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram318( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram319( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram320( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram321( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram322( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram323( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram324( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram325( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram326( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram327( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram328( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram329( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram330( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram331( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram332( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram333( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram334( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram335( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram336( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram337( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram338( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram339( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram340( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram341( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram342( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram343( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram344( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram345( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram346( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram347( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram348( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram349( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram350( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram351( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram352( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram353( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram354( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram355( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram356( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram357( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram358( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram359( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram360( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram361( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram362( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram363( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram364( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram365( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram366( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram367( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram368( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram369( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram370( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram371( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram372( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram373( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram374( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram375( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram376( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram377( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram378( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram379( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram380( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram381( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram382( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram383( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram384( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram385( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram387( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram388( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram389( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram390( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram391( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram392( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram393( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram394( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram395( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram396( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram397( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram398( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram399( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram400( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram401( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram402( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram403( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram404( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram405( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram406( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram407( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram408( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram409( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram410( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram411( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram412( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram413( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram414( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram415( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram416( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram417( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram418( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram419( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram420( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram421( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram422( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram423( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram424( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram425( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram426( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram427( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram428( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram429( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram430( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram431( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram432( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram433( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram434( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram435( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram436( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram437( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram438( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram439( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram440( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram441( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram442( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram443( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram444( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram445( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram446( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram447( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram448( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram449( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram450( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram451( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram452( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram453( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram454( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram455( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram456( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram457( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram458( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram459( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram460( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram461( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram462( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram463( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram464( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram465( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram466( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram467( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram468( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram469( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram470( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram471( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram472( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram473( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram474( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram475( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram476( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram477( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram478( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram479( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram480( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram481( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram482( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram483( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram484( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram485( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram486( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram487( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram488( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram489( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram490( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram491( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram492( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram493( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram494( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram495( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram496( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram497( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram498( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram499( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram500( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram501( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram502( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram503( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram504( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram505( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram506( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram507( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram508( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram509( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram510( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram511( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram512( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram513( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram514( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram515( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram516( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram517( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram518( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram519( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram520( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram521( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram522( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram523( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram524( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram525( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram526( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram527( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram528( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram529( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram530( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram531( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram532( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram533( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram534( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram535( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram536( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram537( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram538( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram539( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram540( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram541( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram542( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram543( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram544( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram545( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram546( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram547( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram548( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram549( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram550( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram551( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram552( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram553( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram554( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram555( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram556( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram557( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram558( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram559( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram560( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram561( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram562( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram563( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram564( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram565( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram566( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram567( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram568( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram569( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram570( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram571( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram572( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram573( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram574( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram575( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram576( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram577( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram578( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram579( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram580( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram581( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram582( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram583( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram584( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram585( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram586( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram587( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram588( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram590( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram591( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram592( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram593( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram594( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram595( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram596( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram597( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram598( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram599( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram600( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram601( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram602( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram603( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram604( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram605( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram606( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram609( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram610( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram611( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram612( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram613( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram614( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram615( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram616( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram617( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram618( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram619( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram620( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram621( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram622( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram623( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram624( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram625( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram626( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram627( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram628( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram629( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram630( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram631( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram632( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram633( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram634( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram635( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram636( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram637( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram638( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram639( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram640( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram641( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram642( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram643( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram644( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram645( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram646( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram647( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram648( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram649( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram650( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram651( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram652( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram653( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram654( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram655( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram656( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram657( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram658( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram659( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram660( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram661( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram662( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram663( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram664( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram665( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram666( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram667( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram668( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram669( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram670( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram671( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram672( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram673( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram674( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram675( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram676( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram677( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram678( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram679( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram680( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram681( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram682( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram683( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram684( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram685( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram686( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram687( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram688( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram689( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram690( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram691( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram692( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram693( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram694( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram695( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram696( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram697( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram698( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram699( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram700( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram701( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram702( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram703( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram704( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram705( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram706( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram707( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram708( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram709( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram710( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram711( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram712( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram713( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram714( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram715( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram716( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram717( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram718( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram719( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram720( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram721( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram722( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram723( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram724( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram725( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram726( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram727( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram728( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram729( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram730( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram731( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram732( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram733( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram734( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram735( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram736( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram737( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram738( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram739( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram740( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram741( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram742( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram743( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram744( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram745( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram746( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram747( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram748( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram749( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram750( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram751( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram752( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram753( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram754( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram755( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram756( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram757( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram758( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram759( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram760( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram761( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram762( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram763( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram764( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram765( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram766( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram767( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram768( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram769( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram770( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram771( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram772( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram773( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram774( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram775( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram776( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram777( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram778( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram779( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram780( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram781( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram782( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram783( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram784( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram785( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram786( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram787( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram788( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram789( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram790( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram791( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram792( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram793( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram794( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram795( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram796( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram797( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram798( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram799( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram800( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram801( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram802( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram803( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram804( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram805( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram806( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram807( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram808( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram809( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram810( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram811( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram812( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram813( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram814( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram815( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram816( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram817( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram818( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram819( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram820( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram821( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram822( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram823( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram824( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram825( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram826( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram827( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram828( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram829( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram830( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram831( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram832( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram833( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram834( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram835( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram836( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram837( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram838( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram839( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram840( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram841( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram842( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram843( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram844( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram845( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram846( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram847( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram848( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram849( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram850( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram851( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram852( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram853( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram854( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram855( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram856( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram857( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram858( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram859( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram860( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram861( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram862( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram863( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram864( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram865( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram866( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram867( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram868( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram869( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram870( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram871( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram872( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram873( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram874( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram875( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram876( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram877( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram878( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram879( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram880( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram881( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram882( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram883( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram884( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram885( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram886( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram887( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram888( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram889( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram890( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram891( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram892( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram893( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram894( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram895( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram896( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram897( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram898( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram899( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram900( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram901( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram902( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram903( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram904( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram905( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram906( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram907( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram908( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram909( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram910( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram911( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram912( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram913( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram914( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram915( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram916( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram917( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram918( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram919( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram920( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram921( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram922( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram923( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram924( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram925( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram926( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram927( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram928( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram929( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram930( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram931( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram932( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram933( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram934( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram935( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram936( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram937( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram938( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram939( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram940( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram941( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram942( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram943( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram944( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram945( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram946( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram947( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram948( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram949( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram950( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram951( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram952( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram953( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram954( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram955( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram956( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram957( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram958( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram959( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram960( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram961( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram962( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram963( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram964( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram965( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram966( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram967( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram968( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram969( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram970( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram971( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram972( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram973( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram974( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram975( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram976( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram977( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram978( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram979( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram980( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram981( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram982( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram983( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram984( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram985( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram986( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram987( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram988( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram989( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram990( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram991( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram992( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram993( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram994( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram995( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram996( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram997( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram998( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram999( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1000( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1001( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1002( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1003( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1004( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1005( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1006( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1007( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1008( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1009( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1010( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1011( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1012( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1013( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1014( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1015( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1016( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1017( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1018( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1019( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1020( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1021( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1022( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1023( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1024( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1025( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1026( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1027( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1028( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1029( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1030( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1031( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1032( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1033( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1034( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1035( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1036( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1037( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1038( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1039( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1040( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1041( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1042( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1043( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1044( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1045( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1046( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1047( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1048( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1049( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1050( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1051( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1052( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1053( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1054( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1055( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1056( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1057( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1058( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1059( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1060( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1061( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1062( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1063( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1064( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1065( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1066( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1067( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1068( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1069( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1070( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1071( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1072( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1073( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1074( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1075( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1076( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1077( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1078( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1079( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1080( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1081( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1082( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1083( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1084( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1085( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1086( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1087( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1088( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1089( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1092( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1093( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1094( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1095( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1096( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1097( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1098( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1099( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1114( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1123( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1124( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1125( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1126( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1127( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1128( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1129( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1130( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1131( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1132( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1133( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1134( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1135( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1136( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1137( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1138( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1139( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1140( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1141( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1142( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1143( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1144( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1145( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1146( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1147( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1148( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1149( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1150( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1151( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1152( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1153( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1154( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1155( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1156( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1157( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1158( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1159( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1160( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1161( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1162( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1163( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1164( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1165( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1166( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1167( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1168( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1169( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1170( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1171( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1172( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1173( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1174( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1175( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1176( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1177( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1178( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1179( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1180( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1181( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1182( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1183( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1184( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1185( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1186( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1187( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1188( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1189( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1190( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1191( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1192( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1193( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1194( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1195( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1196( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1197( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1198( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1199( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1200( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1201( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1202( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1203( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1204( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1205( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1206( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1207( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1208( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1209( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1210( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1211( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1212( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1213( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1214( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1215( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1216( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1217( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1218( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1219( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1220( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1221( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1222( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1223( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1224( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1225( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1226( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1227( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1228( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1229( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1231( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1232( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1233( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1234( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1235( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1236( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1237( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1238( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1239( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1240( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -32273,7 +3036,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -32309,6 +3076,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -32352,6 +3123,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO 
); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -32454,26 +3229,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -32481,25 +3256,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + + //-------------------------------------------------------------------------- + + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // ME sum over all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using
J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -32644,13 +3623,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -32662,17 +3635,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -32698,93 +3674,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR
channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32826,7 +3772,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -32849,7 +3795,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -32858,25 +3804,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -32886,8 +3838,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -32903,11 +3857,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -33009,14 +3964,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2eb1e066ff..6b99d481e4 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 7; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1890; //static const int ncomb = 128; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp:
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..c027c38503 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -0,0 +1,501 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
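[Context for the new color_sum.cc file that follows: the color sum is the standard MG5aMC quadratic form over the QCD partial amplitudes, i.e. for one helicity |M|^2 = sum_{i,j} Re( conj(jamp[i]) * colorMatrix[i][j] * jamp[j] ) / colorDenom[i]; the GPU path can evaluate it either in plain kernels or via cuBLAS/hipBLAS, presumably using the normalized combination colorMatrix[i][j]/colorDenom[i] that createNormalizedColorMatrix() copies to device memory. A minimal standalone sketch of the reduction, using std::complex<double> and a hypothetical 2-color toy matrix instead of the plugin's fptype_sv types and the 120x120 arrays defined below:

    // Toy color-sum reduction: |M|^2 = Re( jamp^dagger * (colorMatrix/colorDenom) * jamp ).
    // All numeric values here are hypothetical illustrations, not taken from this process.
    #include <complex>
    #include <cstdio>
    int main()
    {
      constexpr int ncolor = 2;                                // toy value (this process has ncolor=120)
      const double colorDenom[ncolor] = { 3, 3 };              // hypothetical per-row denominators
      const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, // hypothetical symmetric color matrix
                                                   { -2, 16 } };
      const std::complex<double> jamp[ncolor] = { { 1., 2. }, { 3., -1. } }; // example partial amplitudes
      double me2 = 0; // |M|^2 for this helicity, before the average over helicities/colors
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0; // ztemp = sum_j colorMatrix[icol][j] * jamp[j]
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += colorMatrix[icol][jcol] * jamp[jcol];
        me2 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol]; // accumulate Re part
      }
      printf( "|M|^2 = %f\n", me2 );
      return 0;
    }

In the real file this reduction is applied to all events and all good helicities at once, which is why the jamps are laid out as per-helicity super-buffers and why the BLAS path can express the sum as a matrix product.]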
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary terms cancel because M is also symmetric, i.e. AMB = BMA).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
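The triangular rewrite in color_sum_cpu above is easy to check in isolation. Below is a minimal standalone C++ sketch, not part of the patch, using a hypothetical 3x3 symmetric matrix M and amplitude vector J (the colorDenom normalization is omitted); it verifies that diagonal terms plus doubled upper-triangular terms reproduce the full quadratic form conj(J)^T M J:

// Standalone check of the triangular trick: for a real symmetric M, the full
// quadratic form conj(J)^T M J is real and equals re(J)^T M re(J) + im(J)^T M im(J),
// and the off-diagonal sum can be folded as 2*M[i][j] with j > i only.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int n = 3; // hypothetical size, not the generated ncolor = 120
  const double M[n][n] = { { 4, 1, 2 }, { 1, 5, 3 }, { 2, 3, 6 } }; // real symmetric
  const std::complex<double> J[n] = { { 1, 2 }, { -3, 0.5 }, { 2, -1 } };
  // Full quadratic form (its imaginary part cancels because M is symmetric)
  std::complex<double> full = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += std::conj( J[i] ) * M[i][j] * J[j];
  // Triangular evaluation on real and imaginary parts separately
  double tri = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztR = M[i][i] * J[i].real();
    double ztI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * M[i][j] * J[j].real(); // doubled off-diagonal terms
      ztI += 2 * M[i][j] * J[j].imag();
    }
    tri += J[i].real() * ztR + J[i].imag() * ztI;
  }
  assert( std::abs( full.imag() ) < 1e-12 );
  assert( std::abs( full.real() - tri ) < 1e-9 );
  return 0;
}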
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
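color_sum_kernel reads the color matrix from the device symbol s_pNormalizedColorMatrix2 that createNormalizedColorMatrix uploads once at startup. The same build-a-constexpr-table-then-upload pattern can be sketched standalone as follows; the names, the 2x2 size, and the numbers are hypothetical, and cudaMemcpyToSymbol/cudaMemcpyFromSymbol stand in for the plugin's gpuMemcpyToSymbol wrapper:

#include <cstdio>
#include <cuda_runtime.h>

constexpr int N = 2;               // hypothetical size (the generated code uses ncolor)
__device__ float d_normMat[N * N]; // device-resident table read directly by kernels

struct NormTable
{
  constexpr NormTable() : value()
  {
    const int mat[N][N] = { { 6, 2 }, { 2, 6 } }; // hypothetical integer matrix
    const int denom[N] = { 3, 3 };                // hypothetical per-row denominators
    for( int i = 0; i < N; i++ )
      for( int j = 0; j < N; j++ )
        value[i * N + j] = float( mat[i][j] ) / float( denom[i] ); // normalize at compile time
  }
  float value[N * N];
};

void uploadNormTable() // mirrors createNormalizedColorMatrix: copy to the symbol once
{
  static bool first = true;
  if( !first ) return;
  first = false;
  static constexpr NormTable t;
  cudaMemcpyToSymbol( d_normMat, t.value, N * N * sizeof( float ) );
}

int main()
{
  uploadNormTable();
  float host[N * N] = {};
  cudaMemcpyFromSymbol( host, d_normMat, N * N * sizeof( float ) ); // read back to verify
  std::printf( "normMat[0][0] = %g (expect 2)\n", host[0] );
  return 0;
}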
However, just in case this is better for performance, the same striding as in compute_jamps and cuBLAS is used here
+ for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+ for( int icol = 0; icol < ncolor; icol++ )
+ allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ __global__ void
+ convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity
+ const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ allMEs[ievt] = allMEsFpt2[ievt];
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+ void
+ color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+ gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+ {
+ const int nevt = gpublocks * gputhreads;
+
+ // Get the address associated with the normalized color matrix in device memory
+ static fptype2* devNormColMat = nullptr;
+ if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+ fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype2, fptype>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note the new striding for cuBLAS, which comes from
DeviceAccessJamp:
+ // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+ // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+ // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+ // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+ fptype2 alpha1 = 1;
+ fptype2 beta1 = 0;
+ const int ncolorM = ncolor;
+ const int nevtN = nevt;
+ const int ncolorK = ncolor;
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt]
+ // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Ztemp dot JampsVector ) + beta * ME
+ // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+ fptype2 alpha2 = 1;
+ fptype2 beta2 = 1;
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Ztemp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+ nevt ) ); // there are nevt "batches"
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Ztemp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h new file mode 100644 index 0000000000..a35fbac6a1 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h @@ -0,0 +1,51386 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
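+
+// [Editor's note] Illustrative, self-contained sketch (not part of the generated code in this
+// commit): it shows the same two-step BLAS pattern used by color_sum_blas in the color_sum
+// implementation above, reduced to plain float buffers and the explicit single-precision cuBLAS
+// API. It computes s[ievt] = x^T * M * x for nevt vectors x of length nc, stored "new1"-style
+// (element (ievt,icol) at dX[icol*nevt+ievt]). All names below (nc, nevt, dM/dX/dZ/dS,
+// batchedQuadraticForms) are hypothetical and chosen for this example only.
+//
+// #include <cublas_v2.h>
+// #include <cassert>
+// inline void
+// batchedQuadraticForms( cublasHandle_t handle,
+//                        const float* dM, // device: nc x nc normalized color matrix (column-major, symmetric)
+//                        const float* dX, // device: nevt x nc jamp components, dX[icol*nevt+ievt]
+//                        float* dZ,       // device scratch: nc x nevt, receives Z = M * X^T
+//                        float* dS,       // device output: nevt quadratic forms
+//                        const int nc, const int nevt )
+// {
+//   const float one = 1, zero = 0;
+//   // Step 1: Z(nc x nevt) = M(nc x nc) * X^T(nc x nevt); X is transposed inside the gemm
+//   cublasStatus_t status = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T, nc, nevt, nc,
+//                                        &one, dM, nc, dX, nevt, &zero, dZ, nc );
+//   assert( status == CUBLAS_STATUS_SUCCESS );
+//   // Step 2: nevt batched 1x1 gemms (i.e. dot products): s[ievt] = x_ievt . z_ievt
+//   // Batch ievt reads x_ievt as a 1 x nc row of X (lda=nevt, batches offset by strideA=1)
+//   // and z_ievt as the ievt-th nc x 1 column of Z (strideB=nc); note that color_sum_blas
+//   // uses beta=1 here instead, so that |M|^2 accumulates over helicities directly into the MEs buffer
+//   status = cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, nc,
+//                                       &one, dX, nevt, 1, dZ, nc, nc, &zero, dS, 1, 1, nevt );
+//   assert( status == CUBLAS_STATUS_SUCCESS );
+// }
+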
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ // Each diagramN kernel below computes the wavefunction(s) and amplitude(s) of one Feynman
+ // diagram and adds or subtracts the amplitude into the relevant color-flow jamps;
+ // the normalized color matrix is only applied later, in color_sum.
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 1240 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+ vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 );
+ VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+ FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+ VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] );
+ VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+ // Amplitude(s) for diagram number 1
+ VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 1240 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 1240 *** + // Wavefunction(s) for diagram number 3 + // (none) + // Amplitude(s) for diagram number 3 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 1240 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 1240 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 1240 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 1240 *** + // Wavefunction(s) for diagram number 7 + VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 7 + VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 1240 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else 
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 1240 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 1240 *** + // Wavefunction(s) for diagram number 10 + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 
) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 1240 *** + // Wavefunction(s) for diagram number 11 + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); + // Amplitude(s) for diagram number 11 + VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this 
event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 1240 *** + // Wavefunction(s) for diagram number 12 + VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 1240 *** + // Wavefunction(s) for diagram number 13 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 13 + VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 1240 *** + // Wavefunction(s) for diagram number 14 + VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); + // Amplitude(s) for diagram number 14 + VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += 
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 15 OF 1240 ***
+ // Wavefunction(s) for diagram number 15
+ VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
+ // Amplitude(s) for diagram number 15
+ VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 16 OF 1240 ***
+ // Wavefunction(s) for diagram number 16
+ // (none)
+ // Amplitude(s) for diagram number 16
+ VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 17 OF 1240 ***
+ // Wavefunction(s) for diagram number 17
+ VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
+ // Amplitude(s) for diagram number 17
+ VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 18 OF 1240 ***
+ // Wavefunction(s) for diagram number 18
+ // (none)
+ // Amplitude(s) for diagram number 18
+ VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 19 OF 1240 ***
+ // Wavefunction(s) for diagram number 19
+ VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
+ // Amplitude(s) for diagram number 19
+ VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 20 OF 1240 ***
+ // Wavefunction(s) for diagram number 20
+ // (none)
+ // Amplitude(s) for diagram number 20
+ VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
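// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] Every jamps update in these kernels goes through
// J_ACCESS::kernelAccessIcol( jamps, icol ), which selects this event's (or event page's) slot for
// color flow icol inside the jamps[ncolor*2*nevtORneppV] buffer declared in each kernel signature.
// A minimal scalar C++ analogue of such an accessor, under an assumed real/imaginary-split SoA
// layout (the name, the explicit ievt/nevt arguments and the exact layout are assumptions; the
// actual accessor class presumably derives the event index internally, e.g. from the GPU thread):
//
//   inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol, const int ievt, const int nevt )
//   {
//     fptype& re = jamps[( icol * 2 + 0 ) * nevt + ievt]; // real part of jamp[icol] for this event
//     fptype& im = jamps[( icol * 2 + 1 ) * nevt + ievt]; // imaginary part of jamp[icol]
//     return cxtype_ref( re, im );                        // writable complex reference, so += works
//   }
// ------------------------------------------------------------------------------------------------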
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 21 OF 1240 ***
+ // Wavefunction(s) for diagram number 21
+ VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+ // Amplitude(s) for diagram number 21
+ VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 22 OF 1240 ***
+ // Wavefunction(s) for diagram number 22
+ // (none)
+ // Amplitude(s) for diagram number 22
+ VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 23 OF 1240 ***
+ // Wavefunction(s) for diagram number 23
+ // (none)
+ // Amplitude(s) for diagram number 23
+ VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 24 OF 1240 ***
+ // Wavefunction(s) for diagram number 24
+ VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
+ // Amplitude(s) for diagram number 24
+ VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 25 OF 1240 ***
+ // Wavefunction(s) for diagram number 25
+ VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
+ VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
+ VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
+ // Amplitude(s) for diagram number 25
+ VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 26 OF 1240 ***
+ // Wavefunction(s) for diagram number 26
+ FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] );
+ FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+ FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] );
+ // Amplitude(s) for diagram number 26
+ FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 27 OF 1240 ***
+ // Wavefunction(s) for diagram number 27
+ FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
+ // Amplitude(s) for diagram number 27
+ FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 28 OF 1240 ***
+ // Wavefunction(s) for diagram number 28
+ FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
+ // Amplitude(s) for diagram number 28
+ VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 29 OF 1240 ***
+ // Wavefunction(s) for diagram number 29
+ // (none)
+ // Amplitude(s) for diagram number 29
+ FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 30 OF 1240 ***
+ // Wavefunction(s) for diagram number 30
+ // (none)
+ // Amplitude(s) for diagram number 30
+ VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
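// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] The recurring "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL"
// placeholder comments in these kernels refer to the single-diagram-enhancement update (#473). In
// cudacpp code generated with multichannel support, the update after each amplitude call has
// typically looked like the following (shown here for diagram 30; the decoding of the per-event
// channelId and the numerators_sv/denominators_sv views is assumed to happen in the boilerplate):
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );  // this diagram's |amp|^2 only
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // |amp|^2 summed over all diagrams
//   #endif
// ------------------------------------------------------------------------------------------------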
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 31 OF 1240 ***
+ // Wavefunction(s) for diagram number 31
+ // (none)
+ // Amplitude(s) for diagram number 31
+ FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 32 OF 1240 ***
+ // Wavefunction(s) for diagram number 32
+ // (none)
+ // Amplitude(s) for diagram number 32
+ FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 33 OF 1240 ***
+ // Wavefunction(s) for diagram number 33
+ FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
+ FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+ // Amplitude(s) for diagram number 33
+ FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 34 OF 1240 ***
+ // Wavefunction(s) for diagram number 34
+ FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+ // Amplitude(s) for diagram number 34
+ FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 35 OF 1240 ***
+ // Wavefunction(s) for diagram number 35
+ // (none)
+ // Amplitude(s) for diagram number 35
+ FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 36 OF 1240 ***
+ // Wavefunction(s) for diagram number 36
+ FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
+ // Amplitude(s) for diagram number 36
+ FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 37 OF 1240 ***
+ // Wavefunction(s) for diagram number 37
+ FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
+ // Amplitude(s) for diagram number 37
+ FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
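// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] Splitting the 1240 diagrams into one __global__ kernel
// each suggests a host-side driver that launches them in sequence over the same wfs/jamps buffers,
// with the same uniform argument list on both backends. A hypothetical sketch for two of the
// diagrams in this hunk (the gpuLaunchKernel-style macro, the grid variables and the actual call
// site are assumptions; the real driver is elsewhere in this patch):
//
//   #ifdef MGONGPUCPP_GPUIMPL
//   gpuLaunchKernel( diagram36, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
//   gpuLaunchKernel( diagram37, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
//   #else
//   diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); // one SIMD event page
//   diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
//   #endif
// ------------------------------------------------------------------------------------------------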
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 38 OF 1240 ***
+ // Wavefunction(s) for diagram number 38
+ // (none)
+ // Amplitude(s) for diagram number 38
+ FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 39 OF 1240 ***
+ // Wavefunction(s) for diagram number 39
+ // (none)
+ // Amplitude(s) for diagram number 39
+ FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 1240 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 1240 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 1240 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 1240 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 1240 ***
+    // Wavefunction(s) for diagram number 44
+    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
+    // Amplitude(s) for diagram number 44
+    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 1240 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
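+  // The cxtype( 0, 1 ) factor seen in the jamp updates above is the imaginary unit: each colour-flow
+  // amplitude jamp receives the diagram amplitude multiplied by +/-i or +/-1 according to the colour
+  // decomposition of the vertex, which is why the updates come in paired +=/-= lines on fixed indices.
+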
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 1240 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 1240 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 1240 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
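+  // In diagram 48 above, a single diagram contributes three FFV1_0 amplitude calls: w_fp[18], w_fp[19]
+  // and w_fp[20] are presumably the three colour structures of the split four-gluon vertex (computed by
+  // an earlier diagram, not shown in this hunk). Each call overwrites amp_sv[0], which is why the jamp
+  // updates for one amplitude must all run before the next FFV1_0 call.
+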
+  __global__ void
+  diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 1240 ***
+    // Wavefunction(s) for diagram number 49
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
+    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 1240 ***
+    // Wavefunction(s) for diagram number 50
+    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 1240 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 1240 ***
+    // Wavefunction(s) for diagram number 52
+    // (none)
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 1240 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 1240 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
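+  // Diagrams 51-54 above declare "(none)" under "Wavefunction(s)": they reuse wavefunctions already
+  // stored in the wfs buffer by earlier diagrams and only add a new amplitude, so their cost is a
+  // single FFV1_0 call plus the jamp updates.
+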
+  __global__ void
+  diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 1240 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 1240 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 1240 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 1240 ***
+    // Wavefunction(s) for diagram number 58
+    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
+    // Amplitude(s) for diagram number 58
+    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 1240 ***
+    // Wavefunction(s) for diagram number 59
+    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
+    // Amplitude(s) for diagram number 59
+    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 1240 ***
+    // Wavefunction(s) for diagram number 60
+    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
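+  // Note that w_fp[51], computed by FFV1P0_3 in diagram 60 above, is read again by diagram 62 below
+  // without being recomputed: the wfs buffer passed to every diagramXXX kernel carries wavefunctions
+  // across diagrams, which implies the per-diagram kernels are meant to run in diagram order.
+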
+  __global__ void
+  diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 1240 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 1240 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 1240 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 1240 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 1240 ***
+    // Wavefunction(s) for diagram number 65
+    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 1240 ***
+    // Wavefunction(s) for diagram number 66
+    // (none)
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 1240 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 1240 ***
+    // Wavefunction(s) for diagram number 68
+    // (none)
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 1240 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 1240 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 1240 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 1240 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 73 OF 1240 *** + // Wavefunction(s) for diagram number 73 + // (none) + // Amplitude(s) for diagram number 73 + FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 74 OF 1240 *** + // Wavefunction(s) for diagram number 74 + FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 74 + FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 75 OF 1240 *** + // Wavefunction(s) for diagram number 75 + FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] ); + // Amplitude(s) for diagram number 75 + FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 76 OF 1240 *** + // Wavefunction(s) for diagram number 76 + FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] ); + // Amplitude(s) for diagram number 76 + VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 77 OF 1240 *** + // Wavefunction(s) for diagram number 77 + // (none) + // Amplitude(s) for diagram number 77 + FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 78 OF 1240 *** + // Wavefunction(s) for diagram number 78 + // (none) + // Amplitude(s) for diagram number 78 + VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const 
unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 79 OF 1240 *** + // Wavefunction(s) for diagram number 79 + // (none) + // Amplitude(s) for diagram number 79 + FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 80 OF 1240 *** + // Wavefunction(s) for diagram number 80 + // (none) + // Amplitude(s) for diagram number 80 + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 81 OF 1240 *** + // Wavefunction(s) for diagram number 81 + // (none) + // Amplitude(s) for diagram number 81 + FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, 
// input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 82 OF 1240 *** + // Wavefunction(s) for diagram number 82 + // (none) + // Amplitude(s) for diagram number 82 + FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 83 OF 1240 *** + // Wavefunction(s) for diagram number 83 + // (none) + // Amplitude(s) for diagram number 83 + FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 84 OF 1240 *** + // Wavefunction(s) for diagram number 84 + FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); + // Amplitude(s) for diagram number 84 + FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 85 OF 1240 *** + // Wavefunction(s) for diagram number 85 + FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); + // Amplitude(s) for diagram number 85 + FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 86 OF 1240 *** + // Wavefunction(s) for diagram number 86 + FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 86 + VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 87 OF 1240 *** + // Wavefunction(s) for diagram number 87 + // (none) + // Amplitude(s) for diagram number 87 + FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 88 OF 1240 *** + // Wavefunction(s) for diagram number 88 + // (none) + // Amplitude(s) for diagram number 88 + VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 89 OF 1240 *** + // Wavefunction(s) for diagram number 89 + // (none) + // Amplitude(s) for diagram number 89 + FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for 
GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 90 OF 1240 *** + // Wavefunction(s) for diagram number 90 + // (none) + // Amplitude(s) for diagram number 90 + FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 91 OF 1240 *** + // Wavefunction(s) for diagram number 91 + // (none) + // Amplitude(s) for diagram number 91 + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 92 OF 1240 *** + // Wavefunction(s) for diagram number 92 + // (none) + // Amplitude(s) for diagram number 92 + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 93 OF 1240 *** + // Wavefunction(s) for diagram number 93 + // (none) + // Amplitude(s) for diagram number 93 + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 94 OF 1240 *** + // Wavefunction(s) for diagram number 94 + FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); + // Amplitude(s) for diagram number 94 + FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 95 OF 1240 *** + // Wavefunction(s) for diagram number 95 + FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); + // Amplitude(s) for diagram number 95 + FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 96 OF 1240 *** + // Wavefunction(s) for diagram number 96 + FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); + // Amplitude(s) for diagram number 96 + VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 97 OF 1240 *** + // Wavefunction(s) for diagram number 97 + // (none) + // Amplitude(s) for diagram number 97 + FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 98 OF 1240 *** + // Wavefunction(s) for diagram number 98 + // (none) + // Amplitude(s) for diagram number 98 + VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- 
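+
+  // Note on the common boilerplate: every diagramXXX kernel above takes the same uniform
+  // argument list and starts by including "diagram_boilerplate.h". Based on the interface
+  // comments, that include is expected to expand to something like the sketch below (the
+  // amp_sv/amp_fp declarations are illustrative assumptions, not the verbatim header):
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //     assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr ); // sanity check
+  //   #endif
+  //   cxtype_sv amp_sv[1] = {}; // amplitude of one diagram (scalar on GPU, SIMD vector in C++)
+  //   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view of amp_sv, passed to the FFV/VVV calls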
+ + __global__ void + diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 99 OF 1240 *** + // Wavefunction(s) for diagram number 99 + // (none) + // Amplitude(s) for diagram number 99 + FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 100 OF 1240 *** + // Wavefunction(s) for diagram number 100 + // (none) + // Amplitude(s) for diagram number 100 + FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 101 OF 1240 *** + // Wavefunction(s) for diagram number 101 + // (none) + // Amplitude(s) for diagram number 101 + FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 102 OF 1240 *** + // Wavefunction(s) for diagram number 102 + // (none) + // Amplitude(s) for diagram number 102 + FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 103 OF 1240 *** + // Wavefunction(s) for diagram number 103 + // (none) + // Amplitude(s) for diagram number 103 + FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram104( fptype* wfs, // 
+  __global__ void
+  diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 1240 ***
+    // Wavefunction(s) for diagram number 104
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] );
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
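Note on the helas kernel naming used from diagram 104 onwards (the ALOHA convention): FFV1_0 computes only the amplitude of a fermion-fermion-vector vertex, while FFV1_1, FFV1_2 and FFV1P0_3 instead return the off-shell wavefunction of leg 1, 2 or 3; cIPD[0]/cIPD[1] are the internal mass and width of the off-shell fermion, and the literal "0., 0." in the P0 vector variants encode the massless gluon propagator. Prototypes consistent with these call sites (a sketch; the exact declarations live in the generated HelAmps header):

    __device__ void FFV1_0( const fptype F1[], const fptype F2[], const fptype V3[],
                            const fptype COUP[], const double Ccoeff,
                            fptype vertex[] );        // amplitude only
    __device__ void FFV1_2( const fptype F1[], const fptype V3[],
                            const fptype COUP[], const double Ccoeff,
                            const fptype M2, const fptype W2,
                            fptype F2out[] );         // off-shell fermion on leg 2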
+  __global__ void
+  diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 1240 ***
+    // Wavefunction(s) for diagram number 105
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] );
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 1240 ***
+    // Wavefunction(s) for diagram number 106
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
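Note on the "#473" placeholder repeated after every amplitude: in a build generated with multichannel support, that spot carries the single-diagram-enhancement bookkeeping. A sketch of the update pattern used for this purpose in the CUDACPP code base (the channel number 105 is only an example; numerators_sv/denominators_sv are the running sums behind the numerators/denominators arguments):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); // only the selected channel feeds the numerator
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // all diagrams feed the denominator (0 disables SDE)
    #endif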
+  __global__ void
+  diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 1240 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 1240 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 1240 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 110 OF 1240 ***
+    // Wavefunction(s) for diagram number 110
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 110
+    FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 111 OF 1240 ***
+    // Wavefunction(s) for diagram number 111
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 111
+    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 112 OF 1240 ***
+    // Wavefunction(s) for diagram number 112
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
+    // Amplitude(s) for diagram number 112
+    FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 113 OF 1240 ***
+    // Wavefunction(s) for diagram number 113
+    // (none)
+    // Amplitude(s) for diagram number 113
+    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 114 OF 1240 ***
+    // Wavefunction(s) for diagram number 114
+    // (none)
+    // Amplitude(s) for diagram number 114
+    FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
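Note on diagrams 111 and 113 above: VVV1P0_1 fuses the gluon wavefunctions w_fp[27] and w_fp[5] through the triple-gluon vertex into the off-shell gluon w_fp[16], which is then closed twice, once per fermion line:

    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );    // g(27) g(5) -> off-shell gluon (16)
    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );  // diagram 111 closes it on one fermion pair
    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );  // diagram 113 reuses the cached current

Computing the internal current once into the shared w_fp buffer is what makes the split of the 1240 diagrams into separate kernels order-sensitive: diagram 113 may only run after diagram 111.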
+  __global__ void
+  diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 115 OF 1240 ***
+    // Wavefunction(s) for diagram number 115
+    // (none)
+    // Amplitude(s) for diagram number 115
+    FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 116 OF 1240 ***
+    // Wavefunction(s) for diagram number 116
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 116
+    FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 117 OF 1240 ***
+    // Wavefunction(s) for diagram number 117
+    VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] );
+    // Amplitude(s) for diagram number 117
+    FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
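Note on the coupling arguments that split every signature: the GPU build receives one flat array with the event-dependent couplings of all events, while the C++ build receives ready-made COUPs pointers for the current event page. The boilerplate presumably unifies the two views; a sketch of the GPU-side adaptation, mirroring the access-class pattern used elsewhere in the plugin (the ndcoup/nxcoup constants and the handling of the constant independent couplings are elided here):

    #ifdef MGONGPUCPP_GPUIMPL
      const fptype* COUPs[nxcoup];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // dependent couplings: resolve the per-event offset
        COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
    #endif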
+  __global__ void
+  diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 118 OF 1240 ***
+    // Wavefunction(s) for diagram number 118
+    FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] );
+    // Amplitude(s) for diagram number 118
+    FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 119 OF 1240 ***
+    // Wavefunction(s) for diagram number 119
+    // (none)
+    // Amplitude(s) for diagram number 119
+    FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 120 OF 1240 ***
+    // Wavefunction(s) for diagram number 120
+    // (none)
+    // Amplitude(s) for diagram number 120
+    FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 121 OF 1240 ***
+    // Wavefunction(s) for diagram number 121
+    // (none)
+    // Amplitude(s) for diagram number 121
+    FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 122 OF 1240 ***
+    // Wavefunction(s) for diagram number 122
+    // (none)
+    // Amplitude(s) for diagram number 122
+    FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
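Note on diagram 122: one kernel, three amplitude calls. The currents w_fp[30], w_fp[31] and w_fp[32] are computed earlier in the file, presumably by the three color structures of the four-gluon vertex (VVVV1/VVVV3/VVVV4 in the ALOHA naming; those calls are not part of this hunk), and each is closed on the same fermion pair but scatters into a different set of color flows, hence the three distinct jamps blocks. In sketch form, with hypothetical inputs:

    // Assumed earlier in the file (not in this hunk; the w_fp inputs and the COUPs index are hypothetical)
    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );

Diagram 123, just below, repeats the same three contractions on the other fermion pair (w_fp[34], w_fp[2]).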
+  __global__ void
+  diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 123 OF 1240 ***
+    // Wavefunction(s) for diagram number 123
+    // (none)
+    // Amplitude(s) for diagram number 123
+    FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 124 OF 1240 ***
+    // Wavefunction(s) for diagram number 124
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 124
+    FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 125 OF 1240 ***
+    // Wavefunction(s) for diagram number 125
+    FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 125
+    FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 126 OF 1240 ***
+    // Wavefunction(s) for diagram number 126
+    FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] );
+    FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] );
+    // Amplitude(s) for diagram number 126
+    FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 127 OF 1240 ***
+    // Wavefunction(s) for diagram number 127
+    // (none)
+    // Amplitude(s) for diagram number 127
+    FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 128 OF 1240 ***
+    // Wavefunction(s) for diagram number 128
+    FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] );
+    // Amplitude(s) for diagram number 128
+    FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 129 OF 1240 ***
+    // Wavefunction(s) for diagram number 129
+    // (none)
+    // Amplitude(s) for diagram number 129
+    FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 130 OF 1240 ***
+    // Wavefunction(s) for diagram number 130
+    FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );
+    // Amplitude(s) for diagram number 130
+    VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
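Note on diagrams 130, 133 and 136: the inverse factorization of the gluon-fusion pattern above. FFV1P0_3 first collapses the dressed quark pair into a single off-shell gluon current w_fp[58], and three different gluon pairs are then attached to it through the plain VVV1_0 amplitude:

    FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );   // q(52) qbar(34) -> off-shell gluon current (58)
    VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 130
    VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 133 (below) reuses the cached current
    VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 136 (below) likewise

The fermion-line contraction is amortized over three diagrams, at the price of the same kernel ordering constraint noted earlier.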
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 131 OF 1240 *** + // Wavefunction(s) for diagram number 131 + FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + // Amplitude(s) for diagram number 131 + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 132 OF 1240 *** + // Wavefunction(s) for diagram number 132 + // (none) + // Amplitude(s) for diagram number 132 + FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 133 OF 1240 *** + // Wavefunction(s) for diagram number 133 + // (none) + // Amplitude(s) for diagram number 133 + VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 134 OF 1240 *** + // Wavefunction(s) for diagram number 134 + FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 134 + FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 135 OF 1240 *** + // Wavefunction(s) for diagram number 135 + // (none) + // Amplitude(s) for diagram number 135 + FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 136 OF 1240 *** + // Wavefunction(s) for diagram number 136 + // (none) + // Amplitude(s) for diagram number 136 + VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
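[Editor's note] The two comment lines repeated at the top of every diagramXXX kernel describe a contract that is easy to miss in the noise: the SDE arguments are always part of the signature, but must only be dereferenced in multichannel builds. A minimal sketch of that sanity check, assuming this is what diagram_boilerplate.h (not shown in this diff) does with the three pointers; the real header presumably also sets up the w_fp, amp_fp/amp_sv and COUPs views used by the kernel bodies:

  #include <cassert>
  // Sketch only: uniform interface, with a nullptr assertion replacing the
  // multichannel bookkeeping when MGONGPU_SUPPORTS_MULTICHANNEL is undefined.
  inline void sdeSanityCheck( const unsigned int* channelIds, const double* numerators, const double* denominators )
  {
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // SDE disabled at build time:
    assert( numerators == nullptr );   // none of these three pointers
    assert( denominators == nullptr ); // may carry a real buffer
  #else
    (void)channelIds; (void)numerators; (void)denominators; // used by generated code
  #endif
  }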
__global__ void + diagram137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 137 OF 1240 *** + // Wavefunction(s) for diagram number 137 + // (none) + // Amplitude(s) for diagram number 137 + FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 138 OF 1240 *** + // Wavefunction(s) for diagram number 138 + FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); + // Amplitude(s) for diagram number 138 + FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 139 OF 1240 *** + // Wavefunction(s) for diagram number 139 + // (none) + // Amplitude(s) for diagram number 139 + FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 140 OF 1240 *** + // Wavefunction(s) for diagram number 140 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] ); + FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] ); + // Amplitude(s) for diagram number 140 + VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
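[Editor's note] Every amplitude call above is followed by the same placeholder comment about numerators_sv and denominators_sv. For readers without a multichannel build at hand, a scalar sketch of the single-diagram-enhancement update referenced as #473; the function name, the diagramId argument and the exact guards are assumptions, and the generated code operates on vectors of events rather than one scalar amplitude:

  #include <complex>
  // Sketch only: accumulate |amp|^2 per diagram for SDE channel selection.
  inline void updateMultichannel( unsigned int channelId, unsigned int diagramId,
                                  std::complex<double> amp,
                                  double& numerator, double& denominator )
  {
    if( channelId == 0 ) return;                    // 0 disables SDE for this event
    const double amp2 = std::norm( amp );           // |amp|^2 of this diagram
    if( channelId == diagramId ) numerator += amp2; // only the selected channel
    denominator += amp2;                            // all diagrams contribute
  }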
__global__ void + diagram141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 141 OF 1240 *** + // Wavefunction(s) for diagram number 141 + VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] ); + // Amplitude(s) for diagram number 141 + VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 142 OF 1240 *** + // Wavefunction(s) for diagram number 142 + // (none) + // Amplitude(s) for diagram number 142 + VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 143 OF 1240 *** + // Wavefunction(s) for diagram number 143 + FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] ); + // Amplitude(s) for diagram number 143 + FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
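[Editor's note] Every jamp update goes through J_ACCESS::kernelAccessIcol, which hides the memory layout of the ncolor*2 jamps buffer behind a reference that can be read and written in place. A simplified scalar model of such an accessor; the real J_ACCESS class works on raw fptype storage and derives the event index from the grid/SIMD position, so the explicit ievt/nevt arguments here are an illustration only:

  #include <complex>
  // Sketch only: one complex color amplitude per (icol, ievt) pair,
  // stored color-major so events of one color flow are contiguous.
  struct JampAccessSketch
  {
    static std::complex<double>& kernelAccessIcol( std::complex<double>* jamps,
                                                   int icol, int ievt, int nevt )
    {
      return jamps[icol * nevt + ievt]; // returned by reference: "+=" works in place
    }
  };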
__global__ void + diagram144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 144 OF 1240 *** + // Wavefunction(s) for diagram number 144 + // (none) + // Amplitude(s) for diagram number 144 + FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 145 OF 1240 *** + // Wavefunction(s) for diagram number 145 + // (none) + // Amplitude(s) for diagram number 145 + FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 146 OF 1240 *** + // Wavefunction(s) for diagram number 146 + // (none) + // Amplitude(s) for diagram number 146 + FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 147 OF 1240 *** + // Wavefunction(s) for diagram number 147 + FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 147 + FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 148 OF 1240 *** + // Wavefunction(s) for diagram number 148 + FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] ); + // Amplitude(s) for diagram number 148 + VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 149 OF 1240 *** + // Wavefunction(s) for diagram number 149 + // (none) + // Amplitude(s) for diagram number 149 + FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
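[Editor's note] The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects two different data layouts: on GPU each thread receives the full couplings[nevt*ndcoup*2] array and locates its own event, while in the C++ path the caller hands over ready-made per-"event page" COUPs pointers. A hypothetical sketch of the GPU-side extraction, assuming a simple coupling-major real/imag-plane layout (the real code uses AOSOA memory access classes instead):

  using fptype = double; // assumption: double-precision build
  // Sketch only: pointer to the real part of dependent coupling icoup for event ievt,
  // under an assumed couplings[icoup][2][nevt] layout.
  __device__ const fptype* coupForEvent( const fptype* couplings, int nevt, int icoup, int ievt )
  {
    return &couplings[icoup * 2 * nevt + ievt];
  }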
__global__ void + diagram150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 150 OF 1240 *** + // Wavefunction(s) for diagram number 150 + // (none) + // Amplitude(s) for diagram number 150 + FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 151 OF 1240 *** + // Wavefunction(s) for diagram number 151 + FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 151 + VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 152 OF 1240 *** + // Wavefunction(s) for diagram number 152 + // (none) + // Amplitude(s) for diagram number 152 + FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 153 OF 1240 *** + // Wavefunction(s) for diagram number 153 + // (none) + // Amplitude(s) for diagram number 153 + FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 154 OF 1240 *** + // Wavefunction(s) for diagram number 154 + // (none) + // Amplitude(s) for diagram number 154 + VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 155 OF 1240 *** + // Wavefunction(s) for diagram number 155 + // (none) + // Amplitude(s) for diagram number 155 + FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
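[Editor's note] The HELAS-style calls in these kernels follow a fixed naming scheme: FFV1/VVV1/VVVV1 name the Lorentz structure of a fermion-fermion-vector, triple-vector or quartic-vector vertex; a _0 suffix contracts fully known wavefunctions into an amplitude, while _1, _2 or P0_3 solve the same vertex for one off-shell leg and write a new internal wavefunction (here into a w_fp slot). Paraphrased shapes of the two kinds of helpers; the real aloha-generated prototypes in the plugin take vectorized fptype pointers plus masses and widths, so this is an illustration of the calling pattern only:

  #include <complex>
  using cx = std::complex<double>; // assumption: scalar double build
  // amplitude variant: all three legs known, writes the vertex amplitude
  void FFV1_0( const cx F1[], const cx F2[], const cx V3[], cx COUP, double sign, cx* vertex );
  // off-shell variant: solves the same FFV vertex for leg 1 (mass M1, width W1)
  void FFV1_1( const cx F2[], const cx V3[], cx COUP, double sign, double M1, double W1, cx F1out[] );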
__global__ void + diagram156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 156 OF 1240 *** + // Wavefunction(s) for diagram number 156 + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] ); + // Amplitude(s) for diagram number 156 + VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 157 OF 1240 *** + // Wavefunction(s) for diagram number 157 + VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] ); + // Amplitude(s) for diagram number 157 + VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 158 OF 1240 *** + // Wavefunction(s) for diagram number 158 + // (none) + // Amplitude(s) for diagram number 158 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 159 OF 1240 *** + // Wavefunction(s) for diagram number 159 + FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + // Amplitude(s) for diagram number 159 + FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 160 OF 1240 *** + // Wavefunction(s) for diagram number 160 + // (none) + // Amplitude(s) for diagram number 160 + FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
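[Editor's note] Diagram 158 above calls VVVV1_0, VVVV3_0 and VVVV4_0 with identical arguments: these are the three independent Lorentz structures of the quartic gluon vertex, each feeding a different sign pattern of color flows. All three write into the same &amp_fp[0] scratch buffer, which the jamp updates then read back as amp_sv[0]. A sketch of that pairing, assuming the usual _fp/_sv naming convention in this code base (raw fptype storage versus a complex, possibly SIMD-vectorized, view over the same bytes):

  #include <complex>
  using fptype = double;                 // assumption: scalar double build
  using cxtype_sv = std::complex<fptype>; // SIMD builds would vectorize this type
  fptype amp_fp[2];                       // raw storage written by the *_0 calls
  cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex view read into jamps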
__global__ void + diagram161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 161 OF 1240 *** + // Wavefunction(s) for diagram number 161 + // (none) + // Amplitude(s) for diagram number 161 + FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 162 OF 1240 *** + // Wavefunction(s) for diagram number 162 + // (none) + // Amplitude(s) for diagram number 162 + FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 163 OF 1240 *** + // Wavefunction(s) for diagram number 163 + FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 163 + FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 164 OF 1240 *** + // Wavefunction(s) for diagram number 164 + FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] ); + // Amplitude(s) for diagram number 164 + VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 165 OF 1240 *** + // Wavefunction(s) for diagram number 165 + // (none) + // Amplitude(s) for diagram number 165 + FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 166 OF 1240 *** + // Wavefunction(s) for diagram number 166 + // (none) + // Amplitude(s) for diagram number 166 + FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 167 OF 1240 *** + // Wavefunction(s) for diagram number 167 + // (none) + // Amplitude(s) for diagram number 167 + VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
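[Editor's note] With one __global__ kernel per Feynman diagram, a driver would launch these back to back over the same event buffers, so jamps accumulates the color amplitudes across all diagrams of the process. A hypothetical calling sequence, not part of this diff (the wrapper name, launch configuration and the exact buffer handling are assumptions):

  #ifdef MGONGPUCPP_GPUIMPL
  // Sketch only: sequential launches on the default stream; each kernel adds
  // its diagram's contribution to the shared jamps (and SDE) buffers.
  void launchSomeDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                           const fptype* couplings, fptype* numerators, fptype* denominators,
                           int gpublocks, int gputhreads )
  {
    diagram168<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram169<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... one launch per diagram, up to diagram1240
  }
  #endif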
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram168( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 168 OF 1240 ***
+    // Wavefunction(s) for diagram number 168
+    // (none)
+    // Amplitude(s) for diagram number 168
+    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram169( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 169 OF 1240 ***
+    // Wavefunction(s) for diagram number 169
+    // (none)
+    // Amplitude(s) for diagram number 169
+    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram170( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 170 OF 1240 ***
+    // Wavefunction(s) for diagram number 170
+    // (none)
+    // Amplitude(s) for diagram number 170
+    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram171( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 171 OF 1240 ***
+    // Wavefunction(s) for diagram number 171
+    // (none)
+    // Amplitude(s) for diagram number 171
+    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram172( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 172 OF 1240 ***
+    // Wavefunction(s) for diagram number 172
+    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
+    // Amplitude(s) for diagram number 172
+    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram173( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 173 OF 1240 ***
+    // Wavefunction(s) for diagram number 173
+    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
+    // Amplitude(s) for diagram number 173
+    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram174( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 174 OF 1240 ***
+    // Wavefunction(s) for diagram number 174
+    // (none)
+    // Amplitude(s) for diagram number 174
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram175( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 175 OF 1240 ***
+    // Wavefunction(s) for diagram number 175
+    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
+    // Amplitude(s) for diagram number 175
+    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram176( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 176 OF 1240 ***
+    // Wavefunction(s) for diagram number 176
+    // (none)
+    // Amplitude(s) for diagram number 176
+    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram177( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 177 OF 1240 ***
+    // Wavefunction(s) for diagram number 177
+    // (none)
+    // Amplitude(s) for diagram number 177
+    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram178( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 178 OF 1240 ***
+    // Wavefunction(s) for diagram number 178
+    // (none)
+    // Amplitude(s) for diagram number 178
+    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
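+  // Note: "diagram_boilerplate.h" is included at the top of every diagramXXX kernel. A minimal
+  // sketch of what such a header plausibly provides, using hypothetical names consistent with the
+  // surrounding code (this is not the verbatim header contents):
+  //   cxtype_sv amp_sv[1] = {};                             // per-diagram amplitude accumulator
+  //   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // raw view passed to FFV/VVV as &amp_fp[0]
+  // #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  // #endif
+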
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram179( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 179 OF 1240 ***
+    // Wavefunction(s) for diagram number 179
+    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 179
+    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram180( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 180 OF 1240 ***
+    // Wavefunction(s) for diagram number 180
+    // (none)
+    // Amplitude(s) for diagram number 180
+    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram181( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 181 OF 1240 ***
+    // Wavefunction(s) for diagram number 181
+    // (none)
+    // Amplitude(s) for diagram number 181
+    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram182( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 182 OF 1240 ***
+    // Wavefunction(s) for diagram number 182
+    // (none)
+    // Amplitude(s) for diagram number 182
+    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram183( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 183 OF 1240 ***
+    // Wavefunction(s) for diagram number 183
+    // (none)
+    // Amplitude(s) for diagram number 183
+    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram184( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 184 OF 1240 ***
+    // Wavefunction(s) for diagram number 184
+    // (none)
+    // Amplitude(s) for diagram number 184
+    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram185( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 185 OF 1240 ***
+    // Wavefunction(s) for diagram number 185
+    // (none)
+    // Amplitude(s) for diagram number 185
+    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram186( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 186 OF 1240 ***
+    // Wavefunction(s) for diagram number 186
+    // (none)
+    // Amplitude(s) for diagram number 186
+    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram187( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 187 OF 1240 ***
+    // Wavefunction(s) for diagram number 187
+    // (none)
+    // Amplitude(s) for diagram number 187
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram188( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 188 OF 1240 ***
+    // Wavefunction(s) for diagram number 188
+    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 188
+    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram189( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 189 OF 1240 ***
+    // Wavefunction(s) for diagram number 189
+    // (none)
+    // Amplitude(s) for diagram number 189
+    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram190( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 190 OF 1240 ***
+    // Wavefunction(s) for diagram number 190
+    FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
+    // Amplitude(s) for diagram number 190
+    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram191( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 191 OF 1240 ***
+    // Wavefunction(s) for diagram number 191
+    // (none)
+    // Amplitude(s) for diagram number 191
+    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram192( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 192 OF 1240 ***
+    // Wavefunction(s) for diagram number 192
+    // (none)
+    // Amplitude(s) for diagram number 192
+    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram193( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 193 OF 1240 ***
+    // Wavefunction(s) for diagram number 193
+    // (none)
+    // Amplitude(s) for diagram number 193
+    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram194( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 194 OF 1240 ***
+    // Wavefunction(s) for diagram number 194
+    // (none)
+    // Amplitude(s) for diagram number 194
+    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram195( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 195 OF 1240 ***
+    // Wavefunction(s) for diagram number 195
+    // (none)
+    // Amplitude(s) for diagram number 195
+    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram196( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 196 OF 1240 ***
+    // Wavefunction(s) for diagram number 196
+    // (none)
+    // Amplitude(s) for diagram number 196
+    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram197( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 197 OF 1240 ***
+    // Wavefunction(s) for diagram number 197
+    // (none)
+    // Amplitude(s) for diagram number 197
+    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram198( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 198 OF 1240 ***
+    // Wavefunction(s) for diagram number 198
+    // (none)
+    // Amplitude(s) for diagram number 198
+    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram199( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 199 OF 1240 ***
+    // Wavefunction(s) for diagram number 199
+    FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
+    // Amplitude(s) for diagram number 199
+    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram200( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 200 OF 1240 ***
+    // Wavefunction(s) for diagram number 200
+    // (none)
+    // Amplitude(s) for diagram number 200
+    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram201( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 201 OF 1240 ***
+    // Wavefunction(s) for diagram number 201
+    // (none)
+    // Amplitude(s) for diagram number 201
+    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram202( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 202 OF 1240 ***
+    // Wavefunction(s) for diagram number 202
+    // (none)
+    // Amplitude(s) for diagram number 202
+    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram203( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 203 OF 1240 ***
+    // Wavefunction(s) for diagram number 203
+    // (none)
+    // Amplitude(s) for diagram number 203
+    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram204( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 204 OF 1240 ***
+    // Wavefunction(s) for diagram number 204
+    // (none)
+    // Amplitude(s) for diagram number 204
+    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram205( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 205 OF 1240 ***
+    // Wavefunction(s) for diagram number 205
+    // (none)
+    // Amplitude(s) for diagram number 205
+    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram206( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 206 OF 1240 ***
+    // Wavefunction(s) for diagram number 206
+    // (none)
+    // Amplitude(s) for diagram number 206
+    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram207( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 207 OF 1240 ***
+    // Wavefunction(s) for diagram number 207
+    // (none)
+    // Amplitude(s) for diagram number 207
+    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram208( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 208 OF 1240 *** + // Wavefunction(s) for diagram number 208 + FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 208 + FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 209 OF 1240 *** + // Wavefunction(s) for diagram number 209 + // (none) + // Amplitude(s) for diagram number 209 + FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 210 OF 1240 *** + // Wavefunction(s) for diagram number 210 + // (none) + // Amplitude(s) for diagram number 210 + FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 211 OF 1240 *** + // Wavefunction(s) for diagram number 211 + // (none) + // Amplitude(s) for diagram number 211 + FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 212 OF 1240 *** + // Wavefunction(s) for diagram number 212 + // (none) + // Amplitude(s) for diagram number 212 + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
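[Editor's note] Every diagramXXX kernel in this hunk opens with the same preamble and then includes diagram_boilerplate.h, which (per the repeated comment) provides the shared per-event setup and, when MGONGPU_SUPPORTS_MULTICHANNEL is not defined, checks that the channelIds/numerators/denominators pointers are nullptr. That header is not part of this hunk; the following is only a hedged sketch of what the sanity check could look like, reusing the kernel parameter names above (everything beyond those names is an assumption, not the plugin's actual code):

  // Sketch only: hypothetical excerpt of diagram_boilerplate.h (not in this patch)
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // SDE multichannel disabled: no per-event channel ids
  assert( numerators == nullptr );   // no multichannel numerators to update
  assert( denominators == nullptr ); // no multichannel denominators to update
  #endif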
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram213( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 213 OF 1240 ***
+    // Wavefunction(s) for diagram number 213
+    // (none)
+    // Amplitude(s) for diagram number 213
+    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram214( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 214 OF 1240 ***
+    // Wavefunction(s) for diagram number 214
+    // (none)
+    // Amplitude(s) for diagram number 214
+    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram215( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 215 OF 1240 ***
+    // Wavefunction(s) for diagram number 215
+    // (none)
+    // Amplitude(s) for diagram number 215
+    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram216( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 216 OF 1240 ***
+    // Wavefunction(s) for diagram number 216
+    // (none)
+    // Amplitude(s) for diagram number 216
+    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram217( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 217 OF 1240 ***
+    // Wavefunction(s) for diagram number 217
+    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    // Amplitude(s) for diagram number 217
+    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram218( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 218 OF 1240 ***
+    // Wavefunction(s) for diagram number 218
+    // (none)
+    // Amplitude(s) for diagram number 218
+    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram219( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 219 OF 1240 ***
+    // Wavefunction(s) for diagram number 219
+    // (none)
+    // Amplitude(s) for diagram number 219
+    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
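[Editor's note] diagram219 is a four-gluon-vertex diagram: the three calls VVVV1_0, VVVV3_0 and VVVV4_0 evaluate the three color structures of the same vertex, and each result is scattered into a different subset of the color amplitudes with coefficients of plus or minus 1 or i (cxtype( 0, 1 )). The J_ACCESS::kernelAccessIcol accessor hides the memory layout of jamps. As a hedged sketch only, assuming an SoA layout jamps[icol][re/im][ievt] with one GPU thread per event (the plugin's real accessor may differ in both name and layout), it could look like:

  // Sketch only: hypothetical SoA accessor, assuming nevt = gridDim.x * blockDim.x
  __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = gridDim.x * blockDim.x;                 // events in this grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // one event per thread
    return cxtype_ref( jamps[( icol * 2 + 0 ) * nevt + ievt],   // real part
                       jamps[( icol * 2 + 1 ) * nevt + ievt] ); // imaginary part
  }

Returning a complex reference view is what allows the generated "+=" and "-=" accumulations above to write both components in place.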
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram220( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 220 OF 1240 ***
+    // Wavefunction(s) for diagram number 220
+    // (none)
+    // Amplitude(s) for diagram number 220
+    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram221( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 221 OF 1240 ***
+    // Wavefunction(s) for diagram number 221
+    // (none)
+    // Amplitude(s) for diagram number 221
+    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram222( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 222 OF 1240 ***
+    // Wavefunction(s) for diagram number 222
+    // (none)
+    // Amplitude(s) for diagram number 222
+    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram223( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 223 OF 1240 ***
+    // Wavefunction(s) for diagram number 223
+    // (none)
+    // Amplitude(s) for diagram number 223
+    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram224( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 224 OF 1240 ***
+    // Wavefunction(s) for diagram number 224
+    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 224
+    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram225( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 225 OF 1240 ***
+    // Wavefunction(s) for diagram number 225
+    // (none)
+    // Amplitude(s) for diagram number 225
+    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram226( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 226 OF 1240 ***
+    // Wavefunction(s) for diagram number 226
+    // (none)
+    // Amplitude(s) for diagram number 226
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram227( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 227 OF 1240 ***
+    // Wavefunction(s) for diagram number 227
+    // (none)
+    // Amplitude(s) for diagram number 227
+    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram228( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 228 OF 1240 ***
+    // Wavefunction(s) for diagram number 228
+    // (none)
+    // Amplitude(s) for diagram number 228
+    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram229( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 229 OF 1240 ***
+    // Wavefunction(s) for diagram number 229
+    // (none)
+    // Amplitude(s) for diagram number 229
+    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram230( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 230 OF 1240 ***
+    // Wavefunction(s) for diagram number 230
+    // (none)
+    // Amplitude(s) for diagram number 230
+    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram231( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 231 OF 1240 ***
+    // Wavefunction(s) for diagram number 231
+    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 231
+    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram232( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 232 OF 1240 ***
+    // Wavefunction(s) for diagram number 232
+    // (none)
+    // Amplitude(s) for diagram number 232
+    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram233( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 233 OF 1240 ***
+    // Wavefunction(s) for diagram number 233
+    // (none)
+    // Amplitude(s) for diagram number 233
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram234( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 234 OF 1240 ***
+    // Wavefunction(s) for diagram number 234
+    // (none)
+    // Amplitude(s) for diagram number 234
+    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram235( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 235 OF 1240 ***
+    // Wavefunction(s) for diagram number 235
+    // (none)
+    // Amplitude(s) for diagram number 235
+    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram236( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 236 OF 1240 ***
+    // Wavefunction(s) for diagram number 236
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
+    // Amplitude(s) for diagram number 236
+    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
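Diagrams 234-236 show the payoff of the uniform interface: each kernel differs only in its body, so a driver can step through them uniformly, and diagram 236's VVVV*P0_1 calls write internal wavefunctions (w_fp[73], w_fp[79], w_fp[80]) that diagrams 237 and 238 below reuse, which pins down the execution order. A hypothetical standalone sketch of such a dispatch table, using the C++ (#else) branch of the signature with dummy diagram bodies (diagA, diagB and computeJamps are illustrative names, not plugin code):

  using fptype = double;
  // Uniform signature, matching the #else (C++) branch above.
  using DiagramFn = void ( * )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                const fptype** COUPs, fptype* numerators, fptype* denominators );

  static void diagA( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* ) { jamps[0] += 1.; }
  static void diagB( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* ) { jamps[1] -= 1.; }

  static const DiagramFn diagrams[] = { diagA, diagB }; // would list diagram1 .. diagram1240

  void computeJamps( fptype* wfs, fptype* jamps, const fptype** COUPs )
  {
    // run the diagrams in generation order, so shared wavefunction slots
    // are always written before they are read
    for( DiagramFn fn : diagrams )
      fn( wfs, jamps, nullptr, COUPs, nullptr, nullptr ); // nullptr: multichannel disabled
  }

  int main()
  {
    fptype jamps[2] = {};
    computeJamps( nullptr, jamps, nullptr );
    return 0;
  }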
//-------------------------------------------------------------------------- + + __global__ void + diagram237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 237 OF 1240 *** + // Wavefunction(s) for diagram number 237 + // (none) + // Amplitude(s) for diagram number 237 + FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 238 OF 1240 *** + // Wavefunction(s) for diagram number 238 + // (none) + // Amplitude(s) for diagram number 238 + FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 239 OF 1240 *** + // Wavefunction(s) for diagram number 239 + VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] ); + VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] ); + VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] ); + // Amplitude(s) for diagram number 239 + VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 240 OF 1240 *** + // Wavefunction(s) for diagram number 240 + // (none) + // Amplitude(s) for diagram number 240 + FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram241( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 241 OF 1240 *** + // Wavefunction(s) for diagram number 241 + // (none) + // Amplitude(s) for diagram number 241 + FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram242( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 242 OF 1240 *** + // Wavefunction(s) for diagram number 242 + VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] ); + VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] ); + VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] ); + // Amplitude(s) for diagram number 242 + VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram243( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 243 OF 1240 *** + // Wavefunction(s) for diagram number 243 + // (none) + // Amplitude(s) for diagram number 243 + FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram244( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 244 OF 1240 *** + // Wavefunction(s) for diagram number 244 + // (none) + // Amplitude(s) for diagram number 244 + FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram245( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 245 OF 1240 *** + // Wavefunction(s) for diagram number 245 + // (none) + // Amplitude(s) for diagram number 245 + FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram246( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 246 OF 1240 *** + // Wavefunction(s) for diagram number 246 + // (none) + // Amplitude(s) for diagram number 246 + VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram247( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 247 OF 1240 *** + // Wavefunction(s) for diagram number 247 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 247 + FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram248( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 248 OF 1240 
*** + // Wavefunction(s) for diagram number 248 + FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] ); + // Amplitude(s) for diagram number 248 + FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram249( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 249 OF 1240 *** + // Wavefunction(s) for diagram number 249 + FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] ); + // Amplitude(s) for diagram number 249 + FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram250( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 250 OF 1240 *** + // Wavefunction(s) for diagram number 250 + // (none) + // Amplitude(s) for diagram number 250 + FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram251( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 251 OF 1240 *** + // Wavefunction(s) for diagram number 251 + FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); + // Amplitude(s) for diagram number 251 + FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram252( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 252 OF 1240 *** + // Wavefunction(s) for diagram number 252 + // (none) + // Amplitude(s) for diagram number 252 + FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram253( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 253 OF 1240 *** + // Wavefunction(s) for diagram number 253 + FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] ); + // Amplitude(s) for diagram number 253 + VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram254( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 254 OF 1240 *** + // Wavefunction(s) for diagram number 254 + FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); + // Amplitude(s) for diagram number 254 + FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram255( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 255 OF 1240 *** + // Wavefunction(s) for diagram number 255 + // (none) + // Amplitude(s) for diagram number 255 + FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram256( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 256 OF 1240 *** + // Wavefunction(s) for diagram number 256 + // (none) + // Amplitude(s) for diagram number 256 + VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram257( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers 
all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 257 OF 1240 *** + // Wavefunction(s) for diagram number 257 + FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); + // Amplitude(s) for diagram number 257 + FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram258( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 258 OF 1240 *** + // Wavefunction(s) for diagram number 258 + // (none) + // Amplitude(s) for diagram number 258 + FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram259( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 259 OF 1240 *** + // Wavefunction(s) for diagram number 259 + // (none) + // Amplitude(s) for diagram number 259 + VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram260( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 260 OF 1240 *** + // Wavefunction(s) for diagram number 260 + // (none) + // Amplitude(s) for diagram number 260 + FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram261( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 261 OF 1240 *** + // Wavefunction(s) for diagram number 261 + FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); + // Amplitude(s) for diagram number 261 + FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
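Diagrams 247-262 above each touch only one or two color flows, always through J_ACCESS::kernelAccessIcol. Every signature declares jamps as jamps[ncolor*2*nevtORneppV], i.e. two fptypes (real, imaginary) per color flow per event or per SIMD event page. A minimal scalar model of the accessor, under the assumptions neppV == 1 and contiguous (re, im) pairs in color order (the plugin's real memory access classes are templated over the layout, so this is an illustration, not their implementation):

  #include <complex>
  using fptype = double;
  using cxtype = std::complex<fptype>;

  struct J_ACCESS_scalar
  {
    // color icol occupies two consecutive fptypes (re, im) when neppV == 1;
    // relies on std::complex<T>'s array-layout compatibility with T[2]
    static cxtype& kernelAccessIcol( fptype* jamps, int icol )
    {
      return *reinterpret_cast<cxtype*>( jamps + 2 * icol );
    }
  };

  int main()
  {
    fptype jamps[2 * 120] = {}; // jamp indices up to 119 appear below, consistent with ncolor = 120
    const cxtype amp( 1., -2. );
    J_ACCESS_scalar::kernelAccessIcol( jamps, 41 ) -= amp; // same form as diagram 247 above
    return 0;
  }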
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram262( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 262 OF 1240 ***
+    // Wavefunction(s) for diagram number 262
+    // (none)
+    // Amplitude(s) for diagram number 262
+    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram263( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 263 OF 1240 ***
+    // Wavefunction(s) for diagram number 263
+    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );
+    // Amplitude(s) for diagram number 263
+    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram264( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 264 OF 1240 ***
+    // Wavefunction(s) for diagram number 264
+    // (none)
+    // Amplitude(s) for diagram number 264
+    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram265( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 265 OF 1240 ***
+    // Wavefunction(s) for diagram number 265
+    // (none)
+    // Amplitude(s) for diagram number 265
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram266( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 266 OF 1240 ***
+    // Wavefunction(s) for diagram number 266
+    FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );
+    // Amplitude(s) for diagram number 266
+    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram267( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 267 OF 1240 ***
+    // Wavefunction(s) for diagram number 267
+    // (none)
+    // Amplitude(s) for diagram number 267
+    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram268( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 268 OF 1240 ***
+    // Wavefunction(s) for diagram number 268
+    // (none)
+    // Amplitude(s) for diagram number 268
+    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram269( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 269 OF 1240 ***
+    // Wavefunction(s) for diagram number 269
+    // (none)
+    // Amplitude(s) for diagram number 269
+    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram270( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 270 OF 1240 ***
+    // Wavefunction(s) for diagram number 270
+    FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 270
+    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram271( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 271 OF 1240 ***
+    // Wavefunction(s) for diagram number 271
+    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 271
+    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram272( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 272 OF 1240 ***
+    // Wavefunction(s) for diagram number 272
+    // (none)
+    // Amplitude(s) for diagram number 272
+    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram273( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 273 OF 1240 ***
+    // Wavefunction(s) for diagram number 273
+    // (none)
+    // Amplitude(s) for diagram number 273
+    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
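Each amplitude computed above is accumulated into a handful of color flows with a weight of +1, -1, +i or -i. The snippet below only illustrates that arithmetic with std::complex; it assumes a plain contiguous jamps layout, whereas the real J_ACCESS::kernelAccessIcol hides the plugin's event-parallel memory layout:

    #include <complex>
    #include <vector>
    using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
    int main()
    {
      std::vector<cxtype> jamps( 120 );  // one complex value per color flow (assumed layout)
      const cxtype amp( 0.5, -0.25 );    // stand-in for amp_sv[0]
      jamps[117] += amp;                 // weight +1
      jamps[119] -= amp;                 // weight -1
      jamps[33] += cxtype( 0, 1 ) * amp; // weight +i
      jamps[35] -= cxtype( 0, 1 ) * amp; // weight -i
      return 0;
    }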
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram274( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 274 OF 1240 ***
+    // Wavefunction(s) for diagram number 274
+    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 274
+    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram275( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 275 OF 1240 ***
+    // Wavefunction(s) for diagram number 275
+    // (none)
+    // Amplitude(s) for diagram number 275
+    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram276( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 276 OF 1240 ***
+    // Wavefunction(s) for diagram number 276
+    // (none)
+    // Amplitude(s) for diagram number 276
+    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram277( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 277 OF 1240 ***
+    // Wavefunction(s) for diagram number 277
+    // (none)
+    // Amplitude(s) for diagram number 277
+    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram278( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 278 OF 1240 ***
+    // Wavefunction(s) for diagram number 278
+    // (none)
+    // Amplitude(s) for diagram number 278
+    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram279( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 279 OF 1240 ***
+    // Wavefunction(s) for diagram number 279
+    // (none)
+    // Amplitude(s) for diagram number 279
+    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram280( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 280 OF 1240 ***
+    // Wavefunction(s) for diagram number 280
+    // (none)
+    // Amplitude(s) for diagram number 280
+    VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram281( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 281 OF 1240 ***
+    // Wavefunction(s) for diagram number 281
+    // (none)
+    // Amplitude(s) for diagram number 281
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram282( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 282 OF 1240 ***
+    // Wavefunction(s) for diagram number 282
+    FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 282
+    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram283( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 283 OF 1240 ***
+    // Wavefunction(s) for diagram number 283
+    // (none)
+    // Amplitude(s) for diagram number 283
+    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram284( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 284 OF 1240 ***
+    // Wavefunction(s) for diagram number 284
+    // (none)
+    // Amplitude(s) for diagram number 284
+    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
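Because every diagramNNN kernel shares the same signature, a caller can drive the full set from a single table instead of emitting 1240 hand-written calls. A minimal host-side sketch, with all names assumed for illustration (the actual driver is not part of this hunk):

    using fptype = double; // stand-in for the plugin's fptype
    typedef void ( *DiagramFn )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                 const fptype* couplings, fptype* numerators, fptype* denominators );
    void runDiagrams( const DiagramFn* table, int ndiagrams,
                      fptype* wfs, fptype* jamps, const fptype* couplings )
    {
      // Without multichannel support, channelIds/numerators/denominators stay nullptr,
      // matching the sanity check mentioned in each diagram's header comment.
      for( int i = 0; i < ndiagrams; ++i )
        table[i]( wfs, jamps, nullptr, couplings, nullptr, nullptr );
    }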
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram285( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 285 OF 1240 ***
+    // Wavefunction(s) for diagram number 285
+    // (none)
+    // Amplitude(s) for diagram number 285
+    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram286( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 286 OF 1240 ***
+    // Wavefunction(s) for diagram number 286
+    FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
+    // Amplitude(s) for diagram number 286
+    FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram287( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 287 OF 1240 ***
+    // Wavefunction(s) for diagram number 287
+    FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 287
+    VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram288( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 288 OF 1240 ***
+    // Wavefunction(s) for diagram number 288
+    // (none)
+    // Amplitude(s) for diagram number 288
+    FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram289( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 289 OF 1240 ***
+    // Wavefunction(s) for diagram number 289
+    // (none)
+    // Amplitude(s) for diagram number 289
+    FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram290( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 290 OF 1240 ***
+    // Wavefunction(s) for diagram number 290
+    // (none)
+    // Amplitude(s) for diagram number 290
+    VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram291( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 291 OF 1240 ***
+    // Wavefunction(s) for diagram number 291
+    // (none)
+    // Amplitude(s) for diagram number 291
+    FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram292( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 292 OF 1240 ***
+    // Wavefunction(s) for diagram number 292
+    // (none)
+    // Amplitude(s) for diagram number 292
+    FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram293( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 293 OF 1240 ***
+    // Wavefunction(s) for diagram number 293
+    // (none)
+    // Amplitude(s) for diagram number 293
+    VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram294( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 294 OF 1240 ***
+    // Wavefunction(s) for diagram number 294
+    // (none)
+    // Amplitude(s) for diagram number 294
+    FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram295( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 295 OF 1240 ***
+    // Wavefunction(s) for diagram number 295
+    // (none)
+    // Amplitude(s) for diagram number 295
+    VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram296( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 296 OF 1240 ***
+    // Wavefunction(s) for diagram number 296
+    // (none)
+    // Amplitude(s) for diagram number 296
+    VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram297( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 297 OF 1240 ***
+    // Wavefunction(s) for diagram number 297
+    // (none)
+    // Amplitude(s) for diagram number 297
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram298( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 298 OF 1240 *** + // Wavefunction(s) for diagram number 298 + FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); + // Amplitude(s) for diagram number 298 + FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram299( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 299 OF 1240 *** + // Wavefunction(s) for diagram number 299 + // (none) + // Amplitude(s) for diagram number 299 + FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram300( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 300 OF 1240 *** + // Wavefunction(s) for diagram number 300 + // (none) + // Amplitude(s) for diagram number 300 + FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram301( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 301 OF 1240 *** + // Wavefunction(s) for diagram number 301 + // (none) + // Amplitude(s) for diagram number 301 + FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram302( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 302 OF 1240 *** + // Wavefunction(s) for diagram number 302 + FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 302 + FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram303( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 303 OF 1240 *** + // Wavefunction(s) for diagram number 303 + // (none) + // Amplitude(s) for diagram number 303 + VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram304( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 304 OF 1240 *** + // Wavefunction(s) for diagram number 304 + // (none) + // Amplitude(s) for diagram number 304 + FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram305( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 305 OF 1240 *** + // Wavefunction(s) for diagram number 305 + // (none) + // Amplitude(s) for diagram number 305 + FFV1_0( 
w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram306( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 306 OF 1240 *** + // Wavefunction(s) for diagram number 306 + // (none) + // Amplitude(s) for diagram number 306 + VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram307( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 307 OF 1240 *** + // Wavefunction(s) for diagram number 307 + // (none) + // Amplitude(s) for diagram number 307 + FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram308( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 308 OF 1240 *** + // Wavefunction(s) for diagram number 308 + // (none) + // Amplitude(s) for diagram number 308 + FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram309( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 309 OF 1240 *** + // Wavefunction(s) for diagram number 309 + // (none) + // Amplitude(s) for diagram number 309 + VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 
1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram310( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 310 OF 1240 *** + // Wavefunction(s) for diagram number 310 + // (none) + // Amplitude(s) for diagram number 310 + FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram311( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 311 OF 1240 *** + // Wavefunction(s) for diagram number 311 + FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 311 + FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram312( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 312 OF 1240 *** + // Wavefunction(s) for diagram number 312 + // (none) + // Amplitude(s) for diagram number 312 + FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram313( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 313 OF 1240 *** + // Wavefunction(s) for diagram number 313 + FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] ); + // Amplitude(s) for diagram number 313 + FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram314( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 314 OF 1240 *** + // Wavefunction(s) for diagram number 314 + // (none) + // Amplitude(s) for diagram number 314 + FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram315( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 315 OF 1240 *** + // Wavefunction(s) for diagram number 315 + // (none) + // Amplitude(s) for diagram number 315 + FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram316( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 316 OF 1240 *** + // Wavefunction(s) for diagram number 316 + // (none) + // Amplitude(s) for diagram number 316 + FFV1_0( 
w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram317( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 317 OF 1240 *** + // Wavefunction(s) for diagram number 317 + // (none) + // Amplitude(s) for diagram number 317 + FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram318( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 318 OF 1240 *** + // Wavefunction(s) for diagram number 318 + // (none) + // Amplitude(s) for diagram number 318 + VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + 
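// A minimal illustrative sketch of what the two recurring comments above describe (an assumption
// based on the pre-existing CPPProcess.cc pattern, not the actual generated code in this patch):
// "diagram_boilerplate.h" is expanded at the top of every diagramNNN kernel, and the multichannel
// update of #473 adds the squared amplitude of the event's chosen channel to the numerators and
// that of every diagram to the denominators. Here NNN, channelId, cxabs2, numerators_sv and
// denominators_sv are assumed names.
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Uniform kernel signature without multichannel support: the three unused
//   // pointers carry no data in this build and must all be nullptr
//   assert( channelIds == nullptr );
//   assert( numerators == nullptr );
//   assert( denominators == nullptr );
//   #endif
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Single-diagram-enhancement bookkeeping (#473), after each amplitude call
//   if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the chosen channel
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // every diagram feeds the denominator
//   #endif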
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram319( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 319 OF 1240 ***
+    // Wavefunction(s) for diagram number 319
+    // (none)
+    // Amplitude(s) for diagram number 319
+    FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram320( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 320 OF 1240 ***
+    // Wavefunction(s) for diagram number 320
+    // (none)
+    // Amplitude(s) for diagram number 320
+    FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram321( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 321 OF 1240 ***
+    // Wavefunction(s) for diagram number 321
+    // (none)
+    // Amplitude(s) for diagram number 321
+    FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram322( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 322 OF 1240 ***
+    // Wavefunction(s) for diagram number 322
+    FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+    // Amplitude(s) for diagram number 322
+    FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram323( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 323 OF 1240 ***
+    // Wavefunction(s) for diagram number 323
+    // (none)
+    // Amplitude(s) for diagram number 323
+    FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram324( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 324 OF 1240 ***
+    // Wavefunction(s) for diagram number 324
+    // (none)
+    // Amplitude(s) for diagram number 324
+    FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram325( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 325 OF 1240 ***
+    // Wavefunction(s) for diagram number 325
+    // (none)
+    // Amplitude(s) for diagram number 325
+    FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram326( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 326 OF 1240 ***
+    // Wavefunction(s) for diagram number 326
+    // (none)
+    // Amplitude(s) for diagram number 326
+    FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram327( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 327 OF 1240 ***
+    // Wavefunction(s) for diagram number 327
+    // (none)
+    // Amplitude(s) for diagram number 327
+    VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram328( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 328 OF 1240 ***
+    // Wavefunction(s) for diagram number 328
+    // (none)
+    // Amplitude(s) for diagram number 328
+    FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram329( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 329 OF 1240 ***
+    // Wavefunction(s) for diagram number 329
+    // (none)
+    // Amplitude(s) for diagram number 329
+    FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram330( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 330 OF 1240 ***
+    // Wavefunction(s) for diagram number 330
+    // (none)
+    // Amplitude(s) for diagram number 330
+    FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram331( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 331 OF 1240 ***
+    // Wavefunction(s) for diagram number 331
+    FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
+    // Amplitude(s) for diagram number 331
+    FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram332( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 332 OF 1240 ***
+    // Wavefunction(s) for diagram number 332
+    // (none)
+    // Amplitude(s) for diagram number 332
+    FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram333( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 333 OF 1240 ***
+    // Wavefunction(s) for diagram number 333
+    // (none)
+    // Amplitude(s) for diagram number 333
+    FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram334( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 334 OF 1240 ***
+    // Wavefunction(s) for diagram number 334
+    // (none)
+    // Amplitude(s) for diagram number 334
+    FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram335( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 335 OF 1240 ***
+    // Wavefunction(s) for diagram number 335
+    // (none)
+    // Amplitude(s) for diagram number 335
+    FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram336( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 336 OF 1240 ***
+    // Wavefunction(s) for diagram number 336
+    // (none)
+    // Amplitude(s) for diagram number 336
+    VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram337( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 337 OF 1240 *** + // Wavefunction(s) for diagram number 337 + // (none) + // Amplitude(s) for diagram number 337 + FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram338( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 338 OF 1240 *** + // Wavefunction(s) for diagram number 338 + // (none) + // Amplitude(s) for diagram number 338 + FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 339 OF 1240 *** + // Wavefunction(s) for diagram number 339 + // (none) + // 
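Every diagramNNN kernel in this file shares the identical signature shown above, which is what the recurring "uniform interface" comment refers to: callers never need to know which diagram they are invoking. As a purely illustrative sketch of what that uniformity buys on the C++ path, one could collect the generated kernels in a dispatch table; the names diagram_fn, s_diagrams and runDiagrams below are hypothetical and do not appear in this patch:

  // Sketch only (assumed, not part of this diff): a dispatch table over the
  // uniform diagramNNN signature, C++ (non-GPU) branch of the #ifdef above.
  typedef void ( *diagram_fn )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
  static const diagram_fn s_diagrams[] = { diagram336, diagram337, diagram338 }; // ...one entry per generated kernel
  inline void runDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators )
  {
    for( diagram_fn d : s_diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // same arguments for every diagram
  }

On the GPU branch the same uniformity would allow launching each diagramNNN kernel with identical launch parameters.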
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram339( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 339 OF 1240 ***
+    // Wavefunction(s) for diagram number 339
+    // (none)
+    // Amplitude(s) for diagram number 339
+    FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram340( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 340 OF 1240 ***
+    // Wavefunction(s) for diagram number 340
+    // (none)
+    // Amplitude(s) for diagram number 340
+    VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram341( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 341 OF 1240 ***
+    // Wavefunction(s) for diagram number 341
+    // (none)
+    // Amplitude(s) for diagram number 341
+    VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram342( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 342 OF 1240 ***
+    // Wavefunction(s) for diagram number 342
+    // (none)
+    // Amplitude(s) for diagram number 342
+    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram343( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 343 OF 1240 ***
+    // Wavefunction(s) for diagram number 343
+    // (none)
+    // Amplitude(s) for diagram number 343
+    FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram344( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 344 OF 1240 ***
+    // Wavefunction(s) for diagram number 344
+    // (none)
+    // Amplitude(s) for diagram number 344
+    FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram345( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 345 OF 1240 ***
+    // Wavefunction(s) for diagram number 345
+    // (none)
+    // Amplitude(s) for diagram number 345
+    FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
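Each kernel body above begins with `#include "diagram_boilerplate.h"`, so that header must supply the locals the generated statements rely on (the amp_sv buffer and the _fp alias passed as &_fp[0]) plus the nullptr sanity check described in the recurring comment. The header itself is not part of this excerpt; the following is only a sketch of what its contract could look like, with everything beyond the names visible above assumed:

  // Hypothetical sketch of diagram_boilerplate.h - the real header is not shown in this diff.
  // It is textually included inside every diagramNNN body, so it can refer to the kernel parameters.
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // Uniform interface without multichannel support: the three pointers must all be nullptr (assumed check).
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
  #endif
    cxtype_sv amp_sv[1] = {};                          // local amplitude buffer written by the FFV/VVV/VVVV helpers
    fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // reinterpreted view used as the &_fp[0] argument above

Because the include sits inside the function body, such a header needs no include guard and deliberately has no self-contained scope of its own.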
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram346( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 346 OF 1240 ***
+    // Wavefunction(s) for diagram number 346
+    // (none)
+    // Amplitude(s) for diagram number 346
+    FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram347( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 347 OF 1240 ***
+    // Wavefunction(s) for diagram number 347
+    // (none)
+    // Amplitude(s) for diagram number 347
+    VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram348( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 348 OF 1240 ***
+    // Wavefunction(s) for diagram number 348
+    // (none)
+    // Amplitude(s) for diagram number 348
+    VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram349( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 349 OF 1240 ***
+    // Wavefunction(s) for diagram number 349
+    // (none)
+    // Amplitude(s) for diagram number 349
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram350( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 350 OF 1240 ***
+    // Wavefunction(s) for diagram number 350
+    // (none)
+    // Amplitude(s) for diagram number 350
+    FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram351( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 351 OF 1240 ***
+    // Wavefunction(s) for diagram number 351
+    // (none)
+    // Amplitude(s) for diagram number 351
+    FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram352( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 352 OF 1240 ***
+    // Wavefunction(s) for diagram number 352
+    // (none)
+    // Amplitude(s) for diagram number 352
+    FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram353( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 353 OF 1240 ***
+    // Wavefunction(s) for diagram number 353
+    // (none)
+    // Amplitude(s) for diagram number 353
+    FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram354( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 354 OF 1240 ***
+    // Wavefunction(s) for diagram number 354
+    // (none)
+    // Amplitude(s) for diagram number 354
+    VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
DIAGRAM 355 OF 1240 *** + // Wavefunction(s) for diagram number 355 + // (none) + // Amplitude(s) for diagram number 355 + VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram356( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 356 OF 1240 *** + // Wavefunction(s) for diagram number 356 + // (none) + // Amplitude(s) for diagram number 356 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram357( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 357 OF 1240 *** + // Wavefunction(s) for diagram number 357 + // (none) + // Amplitude(s) for diagram number 357 + FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram358( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 358 OF 1240 *** + // Wavefunction(s) for diagram number 358 + // (none) + // Amplitude(s) for diagram number 358 + FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram359( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 359 OF 1240 *** + // Wavefunction(s) for diagram number 359 + // (none) + // Amplitude(s) for diagram number 359 + VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram360( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 360 OF 1240 *** + // Wavefunction(s) for diagram number 360 + // (none) + // Amplitude(s) for diagram number 360 + FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram361( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, 
// output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 361 OF 1240 *** + // Wavefunction(s) for diagram number 361 + // (none) + // Amplitude(s) for diagram number 361 + FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram362( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 362 OF 1240 *** + // Wavefunction(s) for diagram number 362 + // (none) + // Amplitude(s) for 
+    // Amplitude(s) for diagram number 362
+    VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram363( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 363 OF 1240 ***
+    // Wavefunction(s) for diagram number 363
+    // (none)
+    // Amplitude(s) for diagram number 363
+    FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram364( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 364 OF 1240 ***
+    // Wavefunction(s) for diagram number 364
+    // (none)
+    // Amplitude(s) for diagram number 364
+    FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram365( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 365 OF 1240 ***
+    // Wavefunction(s) for diagram number 365
+    // (none)
+    // Amplitude(s) for diagram number 365
+    VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram366( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 366 OF 1240 ***
+    // Wavefunction(s) for diagram number 366
+    // (none)
+    // Amplitude(s) for diagram number 366
+    FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
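+  // Where the "(#473)" placeholder comments appear, code generated with MGONGPU_SUPPORTS_MULTICHANNEL
+  // defined is assumed to accumulate the squared amplitude into the single-diagram-enhancement
+  // counters; an illustrative sketch (names taken from the surrounding comments, not the verbatim
+  // generated code):
+  //   if( channelIds != nullptr )
+  //   {
+  //     if( channelId == 367 ) numerators_sv += cxabs2( amp_sv[0] ); // only this diagram's channel
+  //     denominators_sv += cxabs2( amp_sv[0] );                      // every contributing channel
+  //   }
+
+  //--------------------------------------------------------------------------
+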
+  __global__ void
+  diagram367( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 367 OF 1240 ***
+    // Wavefunction(s) for diagram number 367
+    // (none)
+    // Amplitude(s) for diagram number 367
+    FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram368( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 368 OF 1240 ***
+    // Wavefunction(s) for diagram number 368
+    // (none)
+    // Amplitude(s) for diagram number 368
+    FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram369( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 369 OF 1240 ***
+    // Wavefunction(s) for diagram number 369
+    // (none)
+    // Amplitude(s) for diagram number 369
+    VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram370( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 370 OF 1240 ***
+    // Wavefunction(s) for diagram number 370
+    VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 370
+    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram371( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 371 OF 1240 ***
+    // Wavefunction(s) for diagram number 371
+    // (none)
+    // Amplitude(s) for diagram number 371
+    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram372( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 372 OF 1240 ***
+    // Wavefunction(s) for diagram number 372
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] );
+    // Amplitude(s) for diagram number 372
+    VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram373( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 373 OF 1240 ***
+    // Wavefunction(s) for diagram number 373
+    // (none)
+    // Amplitude(s) for diagram number 373
+    FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram374( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 374 OF 1240 ***
+    // Wavefunction(s) for diagram number 374
+    VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 374
+    VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
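+  // The jamps buffer holds ncolor complex color amplitudes per event, stored as two fptype each
+  // (hence jamps[ncolor*2*nevtORneppV]). J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to
+  // return a writable complex (vector) reference to color flow icol for the current event page,
+  // so that statements like
+  //   J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];                   // add the amplitude
+  //   J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];  // subtract i times the amplitude
+  // accumulate each diagram into the relevant color flows with the phase dictated by the color algebra.
+
+  //--------------------------------------------------------------------------
+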
+  __global__ void
+  diagram375( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 375 OF 1240 ***
+    // Wavefunction(s) for diagram number 375
+    // (none)
+    // Amplitude(s) for diagram number 375
+    FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram376( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 376 OF 1240 ***
+    // Wavefunction(s) for diagram number 376
+    VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
+    VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 376
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram377( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 377 OF 1240 ***
+    // Wavefunction(s) for diagram number 377
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] );
+    // Amplitude(s) for diagram number 377
+    FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram378( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 378 OF 1240 ***
+    // Wavefunction(s) for diagram number 378
+    FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 378
+    FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram379( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 379 OF 1240 ***
+    // Wavefunction(s) for diagram number 379
+    // (none)
+    // Amplitude(s) for diagram number 379
+    FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram380( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 380 OF 1240 ***
+    // Wavefunction(s) for diagram number 380
+    // (none)
+    // Amplitude(s) for diagram number 380
+    FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram381( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 381 OF 1240 ***
+    // Wavefunction(s) for diagram number 381
+    FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] );
+    // Amplitude(s) for diagram number 381
+    FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram382( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 382 OF 1240 ***
+    // Wavefunction(s) for diagram number 382
+    // (none)
+    // Amplitude(s) for diagram number 382
+    FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram383( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 383 OF 1240 ***
+    // Wavefunction(s) for diagram number 383
+    // (none)
+    // Amplitude(s) for diagram number 383
+    FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram384( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 384 OF 1240 ***
+    // Wavefunction(s) for diagram number 384
+    // (none)
+    // Amplitude(s) for diagram number 384
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram385( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 385 OF 1240 ***
+    // Wavefunction(s) for diagram number 385
+    VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 385
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
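+  // The per-diagram kernels are presumably chained by a calculate_wavefunctions-style driver (an
+  // assumption based on the uniform interface, not the verbatim generated code) that, for each
+  // helicity, resets jamps, calls diagram001 to diagram1240 in turn with the same argument list,
+  // and finally folds jamps into the matrix element through the color matrix, schematically:
+  //   reset jamps to zero
+  //   diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+  //   ... one call per Feynman diagram ...
+  //   ME += sum_{i,j} conj( jamp[i] ) * colorMatrix[i][j] * jamp[j]
+
+  //--------------------------------------------------------------------------
+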
+  __global__ void
+  diagram386( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 386 OF 1240 ***
+    // Wavefunction(s) for diagram number 386
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 386
+    FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram387( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 387 OF 1240 ***
+    // Wavefunction(s) for diagram number 387
+    // (none)
+    // Amplitude(s) for diagram number 387
+    FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram388( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 388 OF 1240 ***
+    // Wavefunction(s) for diagram number 388
+    FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
+    // Amplitude(s) for diagram number 388
+    VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram389( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 389 OF 1240 ***
+    // Wavefunction(s) for diagram number 389
+    // (none)
+    // Amplitude(s) for diagram number 389
+    FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram390( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 390 OF 1240 ***
Wavefunction(s) for diagram number 390 + // (none) + // Amplitude(s) for diagram number 390 + VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram391( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 391 OF 1240 *** + // Wavefunction(s) for diagram number 391 + // (none) + // Amplitude(s) for diagram number 391 + FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram392( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 392 OF 1240 *** + // Wavefunction(s) for diagram number 392 + // (none) + // Amplitude(s) for diagram number 392 + FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram393( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 393 OF 1240 *** + // Wavefunction(s) for diagram number 393 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 393 + FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram394( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 394 OF 1240 *** + // Wavefunction(s) for diagram number 394 + FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] ); + // Amplitude(s) for diagram number 394 + FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram395( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 395 OF 1240 *** + // Wavefunction(s) for diagram number 395 + // (none) + 
// Amplitude(s) for diagram number 395 + FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram396( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 396 OF 1240 *** + // Wavefunction(s) for diagram number 396 + // (none) + // Amplitude(s) for diagram number 396 + FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram397( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 397 OF 1240 *** + // Wavefunction(s) for diagram number 397 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 397 + FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram398( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 398 OF 1240 *** + // Wavefunction(s) for diagram number 398 + // (none) + // Amplitude(s) for diagram number 398 + FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram399( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 399 OF 1240 *** + // Wavefunction(s) for diagram number 399 + // (none) + // Amplitude(s) for diagram number 399 + FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + 
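Aside on the recurring boilerplate comment: the uniform diagramXXX interface keeps channelIds, numerators and denominators in the signature even when the code base is generated without multichannel support. The sketch below is a hypothetical illustration only, not the actual contents of "diagram_boilerplate.h" (the header itself is not part of this diff); it shows the kind of nullptr sanity check that the repeated comment describes, and it assumes assert from <cassert> is available in the including translation unit.

  // Hypothetical sketch (assumed, not taken from diagram_boilerplate.h):
  // without multichannel support the three multichannel arguments of the
  // uniform diagramXXX interface carry no data, so the boilerplate can
  // sanity-check that all three pointers are nullptr.
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // no per-event channel information
    assert( numerators == nullptr );   // no multichannel numerators to update
    assert( denominators == nullptr ); // no multichannel denominators to update
  #endif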
//-------------------------------------------------------------------------- + + __global__ void + diagram400( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 400 OF 1240 *** + // Wavefunction(s) for diagram number 400 + // (none) + // Amplitude(s) for diagram number 400 + FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram401( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 401 OF 1240 *** + // Wavefunction(s) for diagram number 401 + // (none) + // Amplitude(s) for diagram number 401 + FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram402( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 402 OF 1240 *** + // Wavefunction(s) for diagram number 402 + // (none) + // Amplitude(s) for diagram number 402 + FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram403( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 403 OF 1240 *** + // Wavefunction(s) for diagram number 403 + // (none) + // Amplitude(s) for diagram number 403 + FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram404( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 404 OF 1240 *** + // Wavefunction(s) for diagram number 404 + // (none) + // Amplitude(s) for diagram number 404 + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram405( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 405 OF 1240 *** + // Wavefunction(s) for diagram number 405 + // (none) + // Amplitude(s) for diagram number 405 + FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram406( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 406 OF 1240 *** + // Wavefunction(s) for diagram number 406 + // (none) + // Amplitude(s) for diagram number 406 + FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram407( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 407 OF 1240 *** + // Wavefunction(s) for diagram number 407 + // 
(none) + // Amplitude(s) for diagram number 407 + FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram408( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 408 OF 1240 *** + // Wavefunction(s) for diagram number 408 + // (none) + // Amplitude(s) for diagram number 408 + VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram409( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 409 OF 1240 *** + // Wavefunction(s) for diagram number 409 + VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 409 + VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram410( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 410 OF 1240 *** + // Wavefunction(s) for diagram number 410 + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 410 + VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram411( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 411 OF 1240 *** + // Wavefunction(s) for diagram number 411 + // (none) + // Amplitude(s) for diagram number 411 + VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram412( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 412 OF 1240 *** + // Wavefunction(s) for diagram number 412 + // (none) + // Amplitude(s) for diagram number 412 + FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram413( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 413 OF 1240 *** + // Wavefunction(s) for diagram number 413 + // (none) + // Amplitude(s) for diagram number 413 + FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram414( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 414 OF 1240 *** + // Wavefunction(s) for diagram number 414 + // (none) + // Amplitude(s) for diagram number 414 + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram415( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 415 OF 1240 *** + // Wavefunction(s) for diagram number 415 + // (none) + // Amplitude(s) for diagram number 415 + FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram416( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 416 OF 1240 *** + // Wavefunction(s) for diagram number 416 + // (none) + // Amplitude(s) for diagram number 416 + FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram417( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 417 OF 1240 ***
+ // Wavefunction(s) for diagram number 417
+ // (none)
+ // Amplitude(s) for diagram number 417
+ FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram418( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 418 OF 1240 ***
+ // Wavefunction(s) for diagram number 418
+ // (none)
+ // Amplitude(s) for diagram number 418
+ FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram419( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 419 OF 1240 ***
+ // Wavefunction(s) for diagram number 419
+ // (none)
+ // Amplitude(s) for diagram number 419
+ FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram420( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 420 OF 1240 ***
+ // Wavefunction(s) for diagram number 420
+ // (none)
+ // Amplitude(s) for diagram number 420
+ FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram421( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 421 OF 1240 ***
+ // Wavefunction(s) for diagram number 421
+ // (none)
+ // Amplitude(s) for diagram number 421
+ FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram422( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 422 OF 1240 ***
+ // Wavefunction(s) for diagram number 422
+ // (none)
+ // Amplitude(s) for diagram number 422
+ FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram423( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 423 OF 1240 ***
+ // Wavefunction(s) for diagram number 423
+ // (none)
+ // Amplitude(s) for diagram number 423
+ FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
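(Note: diagram_boilerplate.h, included at the top of every diagramXXX kernel in this hunk, is not itself part of the diff. The sketch below shows only what the comments above imply it expands to; the amp_sv/amp_fp aliasing and the exact assert form are assumptions, not the generated file.)

  // Hypothetical sketch of diagram_boilerplate.h, inferred from the comments above
  cxtype_sv amp_sv[1] = {};                             // per-diagram amplitude, read back as amp_sv[0]
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to the HELAS calls as &amp_fp[0]
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // the uniform interface keeps the three multichannel arguments even without
  // multichannel support, but they must then be unused: sanity check
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
  #endif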
+ __global__ void
+ diagram424( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 424 OF 1240 ***
+ // Wavefunction(s) for diagram number 424
+ // (none)
+ // Amplitude(s) for diagram number 424
+ VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram425( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 425 OF 1240 ***
+ // Wavefunction(s) for diagram number 425
+ VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 425
+ VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram426( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 426 OF 1240 ***
+ // Wavefunction(s) for diagram number 426
+ // (none)
+ // Amplitude(s) for diagram number 426
+ VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram427( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 427 OF 1240 ***
+ // Wavefunction(s) for diagram number 427
+ // (none)
+ // Amplitude(s) for diagram number 427
+ VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram428( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 428 OF 1240 ***
+ // Wavefunction(s) for diagram number 428
+ // (none)
+ // Amplitude(s) for diagram number 428
+ FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
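(Note: each amplitude is folded into the color basis through J_ACCESS::kernelAccessIcol, with a relative sign and, for some diagrams, a factor cxtype( 0, 1 ), i.e. +i or -i. The accessor is defined elsewhere; the sketch below only illustrates the indexing implied by the jamps[ncolor*2*nevtORneppV] layout documented in the signatures, assuming one GPU thread per event and a cxtype_ref reference wrapper. It is an assumption, not the actual J_ACCESS implementation.)

  // Hypothetical sketch of the jamps accessor (GPU case only)
  __device__ inline cxtype_ref
  kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = blockDim.x * gridDim.x;              // one thread per event (assumption)
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    fptype& real = jamps[( 2 * icol ) * nevt + ievt];     // real part of jamp[icol] for this event
    fptype& imag = jamps[( 2 * icol + 1 ) * nevt + ievt]; // imaginary part for this event
    return cxtype_ref( real, imag );                      // '+=' then updates both parts in place
  }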
+ __global__ void
+ diagram429( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 429 OF 1240 ***
+ // Wavefunction(s) for diagram number 429
+ // (none)
+ // Amplitude(s) for diagram number 429
+ FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram430( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 430 OF 1240 ***
+ // Wavefunction(s) for diagram number 430
+ // (none)
+ // Amplitude(s) for diagram number 430
+ FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram431( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 431 OF 1240 ***
+ // Wavefunction(s) for diagram number 431
+ // (none)
+ // Amplitude(s) for diagram number 431
+ FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram432( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 432 OF 1240 ***
+ // Wavefunction(s) for diagram number 432
+ // (none)
+ // Amplitude(s) for diagram number 432
+ FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram433( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 433 OF 1240 ***
+ // Wavefunction(s) for diagram number 433
+ // (none)
+ // Amplitude(s) for diagram number 433
+ FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram434( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 434 OF 1240 ***
+ // Wavefunction(s) for diagram number 434
+ VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 434
+ VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram435( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 435 OF 1240 ***
+ // Wavefunction(s) for diagram number 435
+ // (none)
+ // Amplitude(s) for diagram number 435
+ VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram436( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 436 OF 1240 ***
+ // Wavefunction(s) for diagram number 436
+ // (none)
+ // Amplitude(s) for diagram number 436
+ VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram437( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 437 OF 1240 ***
+ // Wavefunction(s) for diagram number 437
+ VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
+ // Amplitude(s) for diagram number 437
+ VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram438( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 438 OF 1240 ***
+ // Wavefunction(s) for diagram number 438
+ // (none)
+ // Amplitude(s) for diagram number 438
+ VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
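(Note: diagrams with a quartic gluon vertex, such as 424, 436 and 439, call VVVV1_0, VVVV3_0 and VVVV4_0 in sequence: the vertex has three independent color/Lorentz structures, each of which overwrites amp_sv[0] before feeding its own set of jamps updates. The VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 triplets play the same role for internal wavefunctions, e.g. w_fp[109]-w_fp[111] in diagram 443 below. The uniform signature is what lets a caller drive all 1240 diagram kernels identically; a minimal sketch of the GPU calling side, where all buffer and launch-configuration names are assumptions:)

  // Hypothetical sketch of sequential per-diagram kernel launches (names assumed)
  diagram439<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  diagram440<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );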
//-------------------------------------------------------------------------- + + __global__ void + diagram439( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 439 OF 1240 *** + // Wavefunction(s) for diagram number 439 + // (none) + // Amplitude(s) for diagram number 439 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram440( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 440 OF 1240 *** + // Wavefunction(s) for diagram number 440 + // (none) + // Amplitude(s) for diagram number 440 + VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram441( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 441 OF 1240 *** + // Wavefunction(s) for diagram number 441 + // (none) + // Amplitude(s) for diagram number 441 + VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram442( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 442 OF 1240 *** + // Wavefunction(s) for diagram number 442 + // (none) + // Amplitude(s) for diagram number 442 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram443( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 443 OF 1240 ***
+    // Wavefunction(s) for diagram number 443
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 443
+    VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
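// [Sketch, not part of the patch] Every diagramXXX kernel above opens with the same
// "diagram_boilerplate.h" prologue, whose contents are not shown in this diff. A minimal
// sketch of what such a prologue could look like is given below; the amp_sv accumulator,
// the _fp alias and the w_fp views are assumptions inferred from how the kernels use
// those names, not the actual file.
//
//   // diagram_boilerplate.h (hypothetical sketch)
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Without multichannel support the uniform interface is kept, but the three
//   // SDE pointers must not be used: assert that they are all nullptr.
//   assert( channelIds == nullptr );
//   assert( numerators == nullptr );
//   assert( denominators == nullptr );
//   #endif
//   cxtype_sv amp_sv[1] = {};                          // amplitude of the current vertex call
//   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // real/imag view passed to the HELAS calls
//   cxtype_sv* w_fp[nwf];                              // per-kernel views into the wfs buffer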
+  __global__ void
+  diagram444( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 444 OF 1240 ***
+    // Wavefunction(s) for diagram number 444
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
+    // Amplitude(s) for diagram number 444
+    VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram445( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 445 OF 1240 ***
+    // Wavefunction(s) for diagram number 445
+    // (none)
+    // Amplitude(s) for diagram number 445
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
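// [Sketch, not part of the patch] The J_ACCESS::kernelAccessIcol calls above implement a
// signed accumulation of the diagram amplitude into one colour flow. Given the documented
// buffer shape jamps[ncolor*2*nevtORneppV], a scalar C++ sketch of the access pattern could
// look as follows; the helper name and the exact Re/Im plane ordering are assumptions.
//
//   #include <complex>
//   using fptype = double;
//   // One complex jamp per colour flow per event, stored as separate Re/Im planes:
//   // index = ( icol * 2 + reim ) * nevt + ievt
//   inline void accumulateJamp( fptype* jamps, int icol, int ievt, int nevt,
//                               std::complex<fptype> amp, fptype sign )
//   {
//     jamps[( icol * 2 + 0 ) * nevt + ievt] += sign * amp.real();
//     jamps[( icol * 2 + 1 ) * nevt + ievt] += sign * amp.imag();
//   }
//   // e.g. "kernelAccessIcol( jamps, 11 ) += amp_sv[0]" would correspond to
//   // accumulateJamp( jamps, 11, ievt, nevt, amp, +1. ) for each event ievt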
+  __global__ void
+  diagram446( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 446 OF 1240 ***
+    // Wavefunction(s) for diagram number 446
+    // (none)
+    // Amplitude(s) for diagram number 446
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram447( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 447 OF 1240 ***
+    // Wavefunction(s) for diagram number 447
+    // (none)
+    // Amplitude(s) for diagram number 447
+    VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram448( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 448 OF 1240 ***
+    // Wavefunction(s) for diagram number 448
+    // (none)
+    // Amplitude(s) for diagram number 448
+    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
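// [Sketch, not part of the patch] The repeated placeholder comment "(#473)" stands for the
// numerator/denominator update that the multichannel-enabled generator emits after each
// amplitude call. In MG5aMC single-diagram enhancement the numerator typically accumulates
// |amp|^2 only for the diagram that drives the selected channel, while the denominator
// accumulates it for every contributing diagram; a sketch of that pattern (the channel
// number, the cxabs2 helper and the _sv names are assumptions) is:
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] ); // hypothetical: this diagram's channel
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // any active channel (0 disables SDE)
//   #endif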
+  __global__ void
+  diagram449( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 449 OF 1240 ***
+    // Wavefunction(s) for diagram number 449
+    // (none)
+    // Amplitude(s) for diagram number 449
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram450( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 450 OF 1240 ***
+    // Wavefunction(s) for diagram number 450
+    // (none)
+    // Amplitude(s) for diagram number 450
+    VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
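// [Sketch, not part of the patch] In diagram450 above the colour-flow update carries an
// extra cxtype( 0, 1 ) factor, i.e. the amplitude is multiplied by the imaginary unit
// before the signed add: for a = x + iy, i*a = -y + ix, so the real and imaginary parts
// swap with one sign flip. A tiny standalone check of that identity:
//
//   #include <cassert>
//   #include <complex>
//   int main()
//   {
//     const std::complex<double> amp( 3., 4. );
//     assert( std::complex<double>( 0., 1. ) * amp == std::complex<double>( -4., 3. ) );
//     return 0;
//   }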
+  __global__ void
+  diagram451( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 451 OF 1240 ***
+    // Wavefunction(s) for diagram number 451
+    // (none)
+    // Amplitude(s) for diagram number 451
+    FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram452( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 452 OF 1240 ***
+    // Wavefunction(s) for diagram number 452
+    // (none)
+    // Amplitude(s) for diagram number 452
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram453( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 453 OF 1240 ***
+    // Wavefunction(s) for diagram number 453
+    // (none)
+    // Amplitude(s) for diagram number 453
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram454( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 454 OF 1240 ***
+    // Wavefunction(s) for diagram number 454
+    // (none)
+    // Amplitude(s) for diagram number 454
+    FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram455( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 455 OF 1240 ***
+    // Wavefunction(s) for diagram number 455
+    // (none)
+    // Amplitude(s) for diagram number 455
+    VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram456( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 456 OF 1240 ***
+    // Wavefunction(s) for diagram number 456
+    // (none)
+    // Amplitude(s) for diagram number 456
+    FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram457( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 457 OF 1240 ***
+    // Wavefunction(s) for diagram number 457
+    // (none)
+    // Amplitude(s) for diagram number 457
+    FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram458( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 458 OF 1240 ***
+    // Wavefunction(s) for diagram number 458
+    // (none)
+    // Amplitude(s) for diagram number 458
+    FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram459( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 459 OF 1240 ***
+    // Wavefunction(s) for diagram number 459
+    // (none)
+    // Amplitude(s) for diagram number 459
+    FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram460( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 460 OF 1240 ***
+    // Wavefunction(s) for diagram number 460
+    // (none)
+    // Amplitude(s) for diagram number 460
+    VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram461( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 461 OF 1240 ***
+    // Wavefunction(s) for diagram number 461
+    // (none)
+    // Amplitude(s) for diagram number 461
+    FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram462( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 462 OF 1240 ***
+    // Wavefunction(s) for diagram number 462
+    // (none)
+    // Amplitude(s) for diagram number 462
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
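// [Sketch, not part of the patch] Because every diagramXXX kernel only adds its own signed
// contributions into the shared jamps buffer, a driver can launch them back to back and
// reduce the colour flows once at the end. A hypothetical CUDA call sequence (the grid
// sizes, buffer names and the final reduction step are assumptions, not code in this diff):
//
//   diagram461<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
//   diagram462<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
//   // ... one launch per diagram, 1 to 1240 ...
//   // then one kernel contracts jamps with the colour matrix to obtain |M|^2 per event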
+  __global__ void
+  diagram463( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 463 OF 1240 ***
+    // Wavefunction(s) for diagram number 463
+    // (none)
+    // Amplitude(s) for diagram number 463
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 464 OF 1240 ***
+    // Wavefunction(s) for diagram number 464
+    // (none)
+    // Amplitude(s) for diagram number 464
+    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram465( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 465 OF 1240 ***
+    // Wavefunction(s) for diagram number 465
+    // (none)
+    // Amplitude(s) for diagram number 465
+    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram466( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 466 OF 1240 ***
+    // Wavefunction(s) for diagram number 466
+    // (none)
+    // Amplitude(s) for diagram number 466
+    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram467( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 467 OF 1240 ***
+    // Wavefunction(s) for diagram number 467
+    // (none)
+    // Amplitude(s) for diagram number 467
+    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram468( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 468 OF 1240 ***
+    // Wavefunction(s) for diagram number 468
+    // (none)
+    // Amplitude(s) for diagram number 468
+    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram469( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 469 OF 1240 ***
+    // Wavefunction(s) for diagram number 469
+    // (none)
+    // Amplitude(s) for diagram number 469
+    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram470( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 470 OF 1240 ***
+    // Wavefunction(s) for diagram number 470
+    // (none)
+    // Amplitude(s) for diagram number 470
+    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram471( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 471 OF 1240 ***
+    // Wavefunction(s) for diagram number 471
+    // (none)
+    // Amplitude(s) for diagram number 471
+    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram472( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 472 OF 1240 *** + // Wavefunction(s) for diagram number 472 + // (none) + // Amplitude(s) for diagram number 472 + FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram473( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 473 OF 1240 *** + // Wavefunction(s) for diagram number 473 + // (none) + // Amplitude(s) for diagram number 473 + FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram474( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 474 OF 1240 *** + // Wavefunction(s) for diagram number 474 + // (none) + // Amplitude(s) for diagram number 474 + FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram475( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 475 OF 1240 *** + // Wavefunction(s) for diagram number 475 + // (none) + // Amplitude(s) for diagram number 475 + VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram476( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 476 OF 1240 *** + // Wavefunction(s) for diagram number 476 + // (none) + // Amplitude(s) for diagram number 476 + FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram477( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 477 OF 1240 *** + // Wavefunction(s) for diagram number 477 + // (none) + // Amplitude(s) for diagram number 477 + VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram478( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 478 OF 1240 *** + // Wavefunction(s) for diagram number 478 + // (none) + // Amplitude(s) for diagram number 478 + FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram479( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 479 OF 1240 *** + // Wavefunction(s) for diagram number 479 + // (none) + // Amplitude(s) for diagram number 479 + FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram480( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 480 OF 1240 *** + // Wavefunction(s) for diagram number 480 + // (none) + // Amplitude(s) for diagram number 480 + FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram481( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform 
interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 481 OF 1240 *** + // Wavefunction(s) for diagram number 481 + // (none) + // Amplitude(s) for diagram number 481 + FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram482( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 482 OF 1240 *** + // Wavefunction(s) for diagram number 482 + // (none) + // Amplitude(s) for diagram number 482 + VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram483( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 483 OF 1240 *** + // Wavefunction(s) for diagram number 483 + // (none) + // Amplitude(s) for diagram number 483 + FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram484( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 484 OF 1240 *** + // Wavefunction(s) for diagram number 484 + // (none) + // Amplitude(s) for diagram number 484 + FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram485( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 485 OF 1240 *** + // Wavefunction(s) for diagram number 485 + // (none) + // Amplitude(s) for diagram number 485 + FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram486( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 486 OF 1240 *** + // Wavefunction(s) for diagram number 486 + // (none) + // Amplitude(s) for diagram number 486 + FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram487( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 487 OF 1240 *** + // Wavefunction(s) for diagram number 487 + // (none) + // Amplitude(s) for diagram number 487 + FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram488( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 488 OF 1240 *** + // Wavefunction(s) for diagram number 488 + // (none) + // Amplitude(s) for diagram number 488 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram489( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 489 OF 1240 *** + // Wavefunction(s) for diagram number 489 + // (none) + // Amplitude(s) for diagram number 489 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram490( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 490 OF 1240 *** + // Wavefunction(s) for diagram number 490 + // (none) + // Amplitude(s) for diagram number 490 + FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram491( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 491 OF 1240 *** + // Wavefunction(s) for diagram number 491 + // (none) + // Amplitude(s) for diagram number 491 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram492( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 492 OF 1240 *** + // Wavefunction(s) for diagram number 492 + // (none) + // Amplitude(s) for diagram number 492 + VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram493( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 493 OF 1240 *** + // Wavefunction(s) for diagram number 493 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 493 + FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram494( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 494 OF 1240 *** + // Wavefunction(s) for diagram number 494 + // (none) + // Amplitude(s) for diagram number 494 + FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram495( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 495 OF 1240 *** + // Wavefunction(s) for diagram number 495 + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 495 + VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram496( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: 
dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 496 OF 1240 ***
+    // Wavefunction(s) for diagram number 496
+    // (none)
+    // Amplitude(s) for diagram number 496
+    FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram497( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 497 OF 1240 ***
+    // Wavefunction(s) for diagram number 497
+    VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 497
+    VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram498( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 498 OF 1240 ***
+    // Wavefunction(s) for diagram number 498
+    // (none)
+    // Amplitude(s) for diagram number 498
+    FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram499( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 499 OF 1240 ***
+    // Wavefunction(s) for diagram number 499
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    // Amplitude(s) for diagram number 499
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram500( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 500 OF 1240 ***
+    // Wavefunction(s) for diagram number 500
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 500
+    FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram501( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 501 OF 1240 ***
+    // Wavefunction(s) for diagram number 501
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 501
+    FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram502( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 502 OF 1240 ***
+    // Wavefunction(s) for diagram number 502
+    // (none)
+    // Amplitude(s) for diagram number 502
+    FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram503( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 503 OF 1240 ***
+    // Wavefunction(s) for diagram number 503
+    // (none)
+    // Amplitude(s) for diagram number 503
+    FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram504( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 504 OF 1240 ***
+    // Wavefunction(s) for diagram number 504
+    FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 504
+    FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram505( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 505 OF 1240 ***
+    // Wavefunction(s) for diagram number 505
+    // (none)
+    // Amplitude(s) for diagram number 505
+    FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram506( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 506 OF 1240 ***
+    // Wavefunction(s) for diagram number 506
+    // (none)
+    // Amplitude(s) for diagram number 506
+    FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
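+  // The per-diagram boilerplate header is not part of this hunk; a minimal sketch of the
+  // sanity check that the recurring comment above attributes to it (hypothetical code,
+  // reconstructed from that comment alone, not from the actual header) would be:
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  //   #endif
+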
+  __global__ void
+  diagram507( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 507 OF 1240 ***
+    // Wavefunction(s) for diagram number 507
+    // (none)
+    // Amplitude(s) for diagram number 507
+    FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram508( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 508 OF 1240 ***
+    // Wavefunction(s) for diagram number 508
+    VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    // Amplitude(s) for diagram number 508
+    FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram509( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 509 OF 1240 ***
+    // Wavefunction(s) for diagram number 509
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
+    // Amplitude(s) for diagram number 509
+    FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram510( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 510 OF 1240 ***
+    // Wavefunction(s) for diagram number 510
+    // (none)
+    // Amplitude(s) for diagram number 510
+    FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram511( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 511 OF 1240 ***
+    // Wavefunction(s) for diagram number 511
+    // (none)
+    // Amplitude(s) for diagram number 511
+    VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram512( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 512 OF 1240 ***
+    // Wavefunction(s) for diagram number 512
+    // (none)
+    // Amplitude(s) for diagram number 512
+    FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram513( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 513 OF 1240 ***
+    // Wavefunction(s) for diagram number 513
+    // (none)
+    // Amplitude(s) for diagram number 513
+    VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram514( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 514 OF 1240 ***
+    // Wavefunction(s) for diagram number 514
+    // (none)
+    // Amplitude(s) for diagram number 514
+    FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram515( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 515 OF 1240 ***
+    // Wavefunction(s) for diagram number 515
+    // (none)
+    // Amplitude(s) for diagram number 515
+    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
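+  // Note on the accumulation pattern: each helas call above writes one amplitude through
+  // &amp_fp[0], which the following lines read back as amp_sv[0] (the two names evidently
+  // alias one buffer, presumably set up in "diagram_boilerplate.h"); the amplitude is then
+  // added into a subset of the colour-flow jamps with coefficient +1, -1, +i or -i, the
+  // imaginary unit being spelled cxtype( 0, 1 ). Schematically (illustration only):
+  //   jamp[icol] += coeff * amp; // coeff in { +1, -1, +i, -i }
+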
interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 516 OF 1240 *** + // Wavefunction(s) for diagram number 516 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + // Amplitude(s) for diagram number 516 + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram517( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 517 OF 1240 *** + // Wavefunction(s) for diagram number 517 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 517 + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram518( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 518 OF 1240 *** + // Wavefunction(s) for diagram number 518 + // (none) + // Amplitude(s) for diagram number 518 + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram519( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 519 OF 1240 *** + // Wavefunction(s) for diagram number 519 + // (none) + // Amplitude(s) for diagram number 519 + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram520( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 520 OF 1240 *** + // Wavefunction(s) for diagram number 520 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 520 + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram521( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 521 OF 1240 *** + // Wavefunction(s) for diagram number 521 + // (none) + // Amplitude(s) for diagram number 521 + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram522( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 522 OF 1240 *** + // Wavefunction(s) for diagram number 522 + // (none) + // Amplitude(s) for diagram number 522 + FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram523( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 523 OF 1240 *** + // Wavefunction(s) for diagram number 523 + // (none) + // Amplitude(s) for diagram number 523 + FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram524( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 524 OF 1240 *** + // Wavefunction(s) for diagram number 524 + // (none) + // Amplitude(s) for diagram number 524 + FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram525( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 525 OF 1240 *** + // Wavefunction(s) for diagram number 525 + // (none) + // Amplitude(s) for diagram number 525 + FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram526( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 526 OF 1240 *** + // Wavefunction(s) for diagram number 526 + // (none) + // Amplitude(s) for diagram number 526 + FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram527( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 527 OF 1240 *** + // Wavefunction(s) for diagram number 527 + // (none) + // Amplitude(s) for diagram number 527 + FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram528( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 528 OF 1240 *** + // Wavefunction(s) for diagram number 528 + // (none) + // Amplitude(s) for diagram number 528 + FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram529( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 529 OF 1240 *** + // Wavefunction(s) for diagram number 529 + // (none) + // Amplitude(s) for diagram number 529 + FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram530( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
+  __global__ void
+  diagram530( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 530 OF 1240 ***
+    // Wavefunction(s) for diagram number 530
+    // (none)
+    // Amplitude(s) for diagram number 530
+    FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram531( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 531 OF 1240 ***
+    // Wavefunction(s) for diagram number 531
+    // (none)
+    // Amplitude(s) for diagram number 531
+    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
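// For orientation (standard QCD, not specific to this patch): the three successive
// VVVV1_0/VVVV3_0/VVVV4_0 calls on the same four wavefunctions in diagram531
// correspond to the three color structures of the four-gluon vertex, which ALOHA
// splits into separate routines so that each can carry its own pattern of jamps
// coefficients:
$$ V^{\mu\nu\rho\sigma}_{abcd} = -i g_s^2 \left[ f^{abe} f^{cde} \left( g^{\mu\rho} g^{\nu\sigma} - g^{\mu\sigma} g^{\nu\rho} \right) + f^{ace} f^{bde} \left( g^{\mu\nu} g^{\rho\sigma} - g^{\mu\sigma} g^{\nu\rho} \right) + f^{ade} f^{bce} \left( g^{\mu\nu} g^{\rho\sigma} - g^{\mu\rho} g^{\nu\sigma} \right) \right] $$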
+  __global__ void
+  diagram532( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 532 OF 1240 ***
+    // Wavefunction(s) for diagram number 532
+    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 532
+    VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram533( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 533 OF 1240 ***
+    // Wavefunction(s) for diagram number 533
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 533
+    VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
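// Note the dependency this creates between kernels: diagram532 and diagram533 first
// build internal propagator wavefunctions (w_fp[86], w_fp[101]) with VVV1P0_1 and
// only then contract them, while later kernels list "Wavefunction(s): (none)" and
// simply reuse those cached slots. A two-line illustration of the producer/consumer
// pairing, quoting the generated calls in diagram532 and diagram535 (the slot reuse
// is presumably why the kernels must run in diagram order):
VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );  // producer: diagram532 fills w_fp[86]
FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); // consumer: diagram535 reads w_fp[86]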
+  __global__ void
+  diagram534( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 534 OF 1240 ***
+    // Wavefunction(s) for diagram number 534
+    // (none)
+    // Amplitude(s) for diagram number 534
+    VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram535( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 535 OF 1240 ***
+    // Wavefunction(s) for diagram number 535
+    // (none)
+    // Amplitude(s) for diagram number 535
+    FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram536( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 536 OF 1240 ***
+    // Wavefunction(s) for diagram number 536
+    // (none)
+    // Amplitude(s) for diagram number 536
+    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram537( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 537 OF 1240 ***
+    // Wavefunction(s) for diagram number 537
+    // (none)
+    // Amplitude(s) for diagram number 537
+    FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram538( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 538 OF 1240 ***
+    // Wavefunction(s) for diagram number 538
+    // (none)
+    // Amplitude(s) for diagram number 538
+    FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
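// diagram_boilerplate.h itself is not part of this hunk; going only by the two
// comments repeated at the top of every kernel, a plausible sketch of its
// non-multichannel branch (all contents assumed, the real header differs) is:
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // The uniform kernel interface still carries the three multichannel pointers,
  // but a build without multichannel support must never use them
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif
// ...presumably followed by the local aliases (w_fp, amp_fp, amp_sv, channelId)
// that every diagram body uses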
+  __global__ void
+  diagram539( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 539 OF 1240 ***
+    // Wavefunction(s) for diagram number 539
+    // (none)
+    // Amplitude(s) for diagram number 539
+    FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram540( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 540 OF 1240 ***
+    // Wavefunction(s) for diagram number 540
+    // (none)
+    // Amplitude(s) for diagram number 540
+    FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram541( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 541 OF 1240 ***
+    // Wavefunction(s) for diagram number 541
+    // (none)
+    // Amplitude(s) for diagram number 541
+    FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram542( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 542 OF 1240 ***
+    // Wavefunction(s) for diagram number 542
+    // (none)
+    // Amplitude(s) for diagram number 542
+    FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram543( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 543 OF 1240 ***
+    // Wavefunction(s) for diagram number 543
+    // (none)
+    // Amplitude(s) for diagram number 543
+    FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram544( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 544 OF 1240 ***
+    // Wavefunction(s) for diagram number 544
+    // (none)
+    // Amplitude(s) for diagram number 544
+    FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram545( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 545 OF 1240 ***
+    // Wavefunction(s) for diagram number 545
+    // (none)
+    // Amplitude(s) for diagram number 545
+    FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram546( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 546 OF 1240 ***
+    // Wavefunction(s) for diagram number 546
+    // (none)
+    // Amplitude(s) for diagram number 546
+    FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram547( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 547 OF 1240 ***
+    // Wavefunction(s) for diagram number 547
+    // (none)
+    // Amplitude(s) for diagram number 547
+    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram548( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 548 OF 1240 ***
+    // Wavefunction(s) for diagram number 548
+    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 548
+    VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
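// Given the __global__ qualifier and the GPU branch of the signature, a minimal
// usage sketch for one of these kernels (buffer and grid names hypothetical; the
// real code base may go through launch wrappers rather than raw triple-chevron
// syntax):
// Hedged usage sketch, GPU build (MGONGPUCPP_GPUIMPL): one launch per diagram,
// in diagram order, with all kernels sharing the same device buffers
diagram548<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );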
+  __global__ void
+  diagram549( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 549 OF 1240 ***
+    // Wavefunction(s) for diagram number 549
+    // (none)
+    // Amplitude(s) for diagram number 549
+    VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram550( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 550 OF 1240 ***
+    // Wavefunction(s) for diagram number 550
+    // (none)
+    // Amplitude(s) for diagram number 550
+    VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram551( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 551 OF 1240 ***
+    // Wavefunction(s) for diagram number 551
+    // (none)
+    // Amplitude(s) for diagram number 551
+    FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram552( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 552 OF 1240 ***
+    // Wavefunction(s) for diagram number 552
+    // (none)
+    // Amplitude(s) for diagram number 552
+    FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram553( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 553 OF 1240 ***
+    // Wavefunction(s) for diagram number 553
+    // (none)
+    // Amplitude(s) for diagram number 553
+    FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram554( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 554 OF 1240 ***
+    // Wavefunction(s) for diagram number 554
+    // (none)
+    // Amplitude(s) for diagram number 554
+    FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram555( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 555 OF 1240 ***
+    // Wavefunction(s) for diagram number 555
+    // (none)
+    // Amplitude(s) for diagram number 555
+    FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram556( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 556 OF 1240 ***
+    // Wavefunction(s) for diagram number 556
+    // (none)
+    // Amplitude(s) for diagram number 556
+    FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram557( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 557 OF 1240 ***
+    // Wavefunction(s) for diagram number 557
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 557
+    VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 558 OF 1240 *** + // Wavefunction(s) for diagram number 558 + // (none) + // Amplitude(s) for diagram number 558 + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram559( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 559 OF 1240 *** + // Wavefunction(s) for diagram number 559 + // (none) + // Amplitude(s) for diagram number 559 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram560( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
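+ // Note: judging from its uses in these kernels, "diagram_boilerplate.h" presumably derives the local w_fp, COUPs, amp_sv and amp_fp variables from the kernel arguments and performs the nullptr sanity check described above; see that header for the authoritative definitions.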
+#include "diagram_boilerplate.h" + // *** DIAGRAM 560 OF 1240 *** + // Wavefunction(s) for diagram number 560 + // (none) + // Amplitude(s) for diagram number 560 + VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram561( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 561 OF 1240 *** + // Wavefunction(s) for diagram number 561 + // (none) + // Amplitude(s) for diagram number 561 + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram562( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 562 OF 1240 *** + // Wavefunction(s) for diagram number 562 + // (none) + // Amplitude(s) for diagram number 562 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( 
w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram563( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 563 OF 1240 *** + // Wavefunction(s) for diagram number 563 + // (none) + // Amplitude(s) for diagram number 563 + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
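// Note: every diagramXXX kernel in this file follows the same generated pattern: + // (1) compute any new internal wavefunctions into w_fp (most diagrams reuse existing ones), + // (2) evaluate a single helicity amplitude into amp_sv[0] via one ALOHA routine (FFV1_0, VVV1_0, VVVV1_0, ...), + // (3) accumulate +/-amp_sv[0], or +/-i*amp_sv[0] via cxtype( 0, 1 ), into the contributing color flows in jamps through J_ACCESS::kernelAccessIcol. + 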
__global__ void + diagram564( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 564 OF 1240 *** + // Wavefunction(s) for diagram number 564 + // (none) + // Amplitude(s) for diagram number 564 + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram565( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 565 OF 1240 *** + // Wavefunction(s) for diagram number 565 + // (none) + // Amplitude(s) for diagram number 565 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram566( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const 
unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 566 OF 1240 *** + // Wavefunction(s) for diagram number 566 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 566 + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram567( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 567 OF 1240 *** + // Wavefunction(s) for diagram number 567 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + // Amplitude(s) for diagram number 567 + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + 
VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram568( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 568 OF 1240 *** + // Wavefunction(s) for diagram number 568 + // (none) + // Amplitude(s) for diagram number 568 + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram569( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for 
GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 569 OF 1240 *** + // Wavefunction(s) for diagram number 569 + // (none) + // Amplitude(s) for diagram number 569 + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram570( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 570 OF 1240 *** + // Wavefunction(s) for diagram number 570 + // (none) + // Amplitude(s) for diagram number 570 + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram571( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 571 OF 1240 *** + // Wavefunction(s) for diagram number 571 + // (none) + // Amplitude(s) for diagram number 571 + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram572( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 572 OF 1240 *** + // Wavefunction(s) for diagram number 572 + // (none) + // Amplitude(s) for diagram number 572 + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram573( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 573 OF 1240 *** + // Wavefunction(s) for diagram number 573 + // (none) + // Amplitude(s) for diagram number 573 + VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram574( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 574 OF 1240 *** + // Wavefunction(s) for diagram number 574 + // (none) + // Amplitude(s) for diagram number 574 + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram575( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 575 OF 1240 *** + // Wavefunction(s) for diagram number 575 + // (none) + // Amplitude(s) for diagram number 575 + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram576( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 576 OF 1240 *** + // Wavefunction(s) for diagram number 576 + // (none) + // Amplitude(s) for diagram number 576 + FFV1_0( 
w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 577 OF 1240 *** + // Wavefunction(s) for diagram number 577 + // (none) + // Amplitude(s) for diagram number 577 + FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram578( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 578 OF 1240 *** + // Wavefunction(s) for diagram number 578 + // (none) + // Amplitude(s) for diagram number 578 + VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram579( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 579 OF 1240 ***
+    // Wavefunction(s) for diagram number 579
+    // (none)
+    // Amplitude(s) for diagram number 579
+    FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram580( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 580 OF 1240 ***
+    // Wavefunction(s) for diagram number 580
+    // (none)
+    // Amplitude(s) for diagram number 580
+    FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram581( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 581 OF 1240 ***
+    // Wavefunction(s) for diagram number 581
+    // (none)
+    // Amplitude(s) for diagram number 581
+    FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram582( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 582 OF 1240 ***
+    // Wavefunction(s) for diagram number 582
+    // (none)
+    // Amplitude(s) for diagram number 582
+    FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram583( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 583 OF 1240 ***
+    // Wavefunction(s) for diagram number 583
+    // (none)
+    // Amplitude(s) for diagram number 583
+    VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram584( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 584 OF 1240 ***
+    // Wavefunction(s) for diagram number 584
+    // (none)
+    // Amplitude(s) for diagram number 584
+    FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram585( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 585 OF 1240 ***
+    // Wavefunction(s) for diagram number 585
+    // (none)
+    // Amplitude(s) for diagram number 585
+    FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
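// Note (a sketch, not the generated code itself): the recurring "(#473)" comments mark where the
// multichannel build updates numerators_sv and denominators_sv. Assuming the usual single-diagram
// enhancement logic and a one-to-one channel/diagram mapping (both assumptions here), the elided
// block for e.g. diagram 585 would plausibly read:
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); // only the selected channel feeds the numerator
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // all channels feed the denominator
//   #endif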
+  __global__ void
+  diagram586( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 586 OF 1240 ***
+    // Wavefunction(s) for diagram number 586
+    // (none)
+    // Amplitude(s) for diagram number 586
+    FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram587( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 587 OF 1240 ***
+    // Wavefunction(s) for diagram number 587
+    // (none)
+    // Amplitude(s) for diagram number 587
+    FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram588( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 588 OF 1240 ***
+    // Wavefunction(s) for diagram number 588
+    // (none)
+    // Amplitude(s) for diagram number 588
+    VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram589( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 589 OF 1240 ***
+    // Wavefunction(s) for diagram number 589
+    // (none)
+    // Amplitude(s) for diagram number 589
+    FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram590( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 590 OF 1240 ***
+    // Wavefunction(s) for diagram number 590
+    // (none)
+    // Amplitude(s) for diagram number 590
+    FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram591( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 591 OF 1240 ***
+    // Wavefunction(s) for diagram number 591
+    // (none)
+    // Amplitude(s) for diagram number 591
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram592( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 592 OF 1240 ***
+    // Wavefunction(s) for diagram number 592
+    // (none)
+    // Amplitude(s) for diagram number 592
+    FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram593( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 593 OF 1240 ***
+    // Wavefunction(s) for diagram number 593
+    // (none)
+    // Amplitude(s) for diagram number 593
+    VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram594( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 594 OF 1240 ***
+    // Wavefunction(s) for diagram number 594
+    // (none)
+    // Amplitude(s) for diagram number 594
+    FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram595( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 595 OF 1240 ***
+    // Wavefunction(s) for diagram number 595
+    // (none)
+    // Amplitude(s) for diagram number 595
+    FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram596( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 596 OF 1240 ***
+    // Wavefunction(s) for diagram number 596
+    // (none)
+    // Amplitude(s) for diagram number 596
+    FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram597( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 597 OF 1240 ***
+    // Wavefunction(s) for diagram number 597
+    // (none)
+    // Amplitude(s) for diagram number 597
+    FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram598( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 598 OF 1240 ***
+    // Wavefunction(s) for diagram number 598
+    // (none)
+    // Amplitude(s) for diagram number 598
+    VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram599( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 599 OF 1240 ***
+    // Wavefunction(s) for diagram number 599
+    // (none)
+    // Amplitude(s) for diagram number 599
+    FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram600( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 600 OF 1240 ***
+    // Wavefunction(s) for diagram number 600
+    // (none)
+    // Amplitude(s) for diagram number 600
+    VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram601( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 601 OF 1240 ***
+    // Wavefunction(s) for diagram number 601
+    // (none)
+    // Amplitude(s) for diagram number 601
+    FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
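// Note (a self-contained illustration, with std::complex standing in for the plugin's own cxtype,
// which is an assumption): the jamp updates in these kernels accumulate each amplitude either
// directly (amp_sv[0]) or rotated by cxtype( 0, 1 ), i.e. multiplied by +i, with paired colour
// flows taking opposite signs.
//
//   #include <cassert>
//   #include <complex>
//   int main()
//   {
//     using cxtype = std::complex<double>;
//     const cxtype amp( 2., 3. );
//     const cxtype rot = cxtype( 0., 1. ) * amp; // +i * ( 2 + 3i ) = -3 + 2i
//     assert( rot == cxtype( -3., 2. ) );
//     assert( -rot == cxtype( 3., -2. ) );       // the paired colour flow gets the opposite sign
//     return 0;
//   }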
+  __global__ void
+  diagram602( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 602 OF 1240 ***
+    // Wavefunction(s) for diagram number 602
+    // (none)
+    // Amplitude(s) for diagram number 602
+    FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram603( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 603 OF 1240 ***
+    // Wavefunction(s) for diagram number 603
+    // (none)
+    // Amplitude(s) for diagram number 603
+    FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram604( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 604 OF 1240 ***
+    // Wavefunction(s) for diagram number 604
+    // (none)
+    // Amplitude(s) for diagram number 604
+    FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram605( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 605 OF 1240 ***
+    // Wavefunction(s) for diagram number 605
+    // (none)
+    // Amplitude(s) for diagram number 605
+    VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram606( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 606 OF 1240 ***
+    // Wavefunction(s) for diagram number 606
+    // (none)
+    // Amplitude(s) for diagram number 606
+    FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram607( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 607 OF 1240 ***
+    // Wavefunction(s) for diagram number 607
+    // (none)
+    // Amplitude(s) for diagram number 607
+    FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 608 OF 1240 ***
+    // Wavefunction(s) for diagram number 608
+    // (none)
+    // Amplitude(s) for diagram number 608
+    FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram609( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 609 OF 1240 ***
+    // Wavefunction(s) for diagram number 609
+    // (none)
+    // Amplitude(s) for diagram number 609
+    FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram610( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 610 OF 1240 ***
+    // Wavefunction(s) for diagram number 610
+    // (none)
+    // Amplitude(s) for diagram number 610
+    FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram611( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 611 OF 1240 ***
+    // Wavefunction(s) for diagram number 611
+    // (none)
+    // Amplitude(s) for diagram number 611
+    FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram612( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 612 OF 1240 ***
+    // Wavefunction(s) for diagram number 612
+    // (none)
+    // Amplitude(s) for diagram number 612
+    FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
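Each amplitude call above is followed by the same placeholder block for builds with MGONGPU_SUPPORTS_MULTICHANNEL. The actual generated statements are elided from this diff; based on the surrounding comments and issue #473, the per-diagram update plausibly has the following shape (the names `channelId`, `numerators_sv`, `denominators_sv` and `cxabs2` are assumed to be provided by the boilerplate header, not confirmed here):

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Sketch only: assumed shape of the multichannel update (#473). The selected
// channel accumulates |amp|^2 into the numerator, and every contributing
// diagram accumulates |amp|^2 into the denominator when SDE is enabled.
if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
#endif
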
+  __global__ void
+  diagram613( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 613 OF 1240 ***
+    // Wavefunction(s) for diagram number 613
+    // (none)
+    // Amplitude(s) for diagram number 613
+    FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram614( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 614 OF 1240 ***
+    // Wavefunction(s) for diagram number 614
+    // (none)
+    // Amplitude(s) for diagram number 614
+    FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram615( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 615 OF 1240 ***
+    // Wavefunction(s) for diagram number 615
+    // (none)
+    // Amplitude(s) for diagram number 615
+    VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
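Because every diagramNNN kernel shares one signature, the caller can treat them uniformly. A hedged sketch of what the calling side could look like on the C++ path (where __global__ is presumably defined away and the kernels are plain functions); the table and loop are illustration only, not part of this diff:

// Hypothetical driver sketch (C++ path, MGONGPUCPP_GPUIMPL not defined):
// iterate a uniform table of per-diagram kernels for one event page.
typedef void ( *DiagramKernel )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
constexpr DiagramKernel diagrams[] = { diagram613, diagram614, diagram615 }; // ... one entry per diagram
for( DiagramKernel diagram : diagrams )
  diagram( wfs, jamps, channelIds, COUPs, numerators, denominators );
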
+  __global__ void
+  diagram616( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 616 OF 1240 ***
+    // Wavefunction(s) for diagram number 616
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 616
+    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram617( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 617 OF 1240 ***
+    // Wavefunction(s) for diagram number 617
+    // (none)
+    // Amplitude(s) for diagram number 617
+    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram618( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 618 OF 1240 ***
+    // Wavefunction(s) for diagram number 618
+    VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] );
+    // Amplitude(s) for diagram number 618
+    VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram619( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 619 OF 1240 ***
+    // Wavefunction(s) for diagram number 619
+    // (none)
+    // Amplitude(s) for diagram number 619
+    FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 620 OF 1240 ***
+    // Wavefunction(s) for diagram number 620
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 620
+    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 621 OF 1240 ***
+    // Wavefunction(s) for diagram number 621
+    // (none)
+    // Amplitude(s) for diagram number 621
+    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram622( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 622 OF 1240 ***
+    // Wavefunction(s) for diagram number 622
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 622
+    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram623( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 623 OF 1240 ***
+    // Wavefunction(s) for diagram number 623
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 623
+    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram624( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 624 OF 1240 ***
+    // Wavefunction(s) for diagram number 624
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 624
+    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram625( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 625 OF 1240 ***
+    // Wavefunction(s) for diagram number 625
+    // (none)
+    // Amplitude(s) for diagram number 625
+    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
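All of these kernels only ever touch jamps through J_ACCESS::kernelAccessIcol, adding the freshly computed amplitude into a fixed set of color amplitudes with weight +1, -1, +i or -i. A scalar stand-in (one event, plain std::complex instead of the SIMD/GPU cxtype_sv) makes the pattern explicit; the accessor below is a deliberate simplification, not the real J_ACCESS:

#include <complex>
using cxtype = std::complex<double>;

// Scalar stand-in for J_ACCESS::kernelAccessIcol: here jamps simply holds
// ncolor complex color amplitudes for one event (real layouts interleave
// events and split real/imaginary parts).
inline cxtype& kernelAccessIcol( cxtype* jamps, int icol ) { return jamps[icol]; }

// Mirrors diagram 625 above: one amplitude enters four color amplitudes
// with real weight +1 or -1 (other diagrams use imaginary weights +-i).
void addDiagram625( cxtype* jamps, cxtype amp )
{
  kernelAccessIcol( jamps, 29 ) += amp;
  kernelAccessIcol( jamps, 37 ) -= amp;
  kernelAccessIcol( jamps, 40 ) += amp;
  kernelAccessIcol( jamps, 43 ) -= amp;
}
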
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 626 OF 1240 *** + // Wavefunction(s) for diagram number 626 + // (none) + // Amplitude(s) for diagram number 626 + FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram627( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 627 OF 1240 *** + // Wavefunction(s) for diagram number 627 + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); + // Amplitude(s) for diagram number 627 + FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram628( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add 
helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 628 OF 1240 *** + // Wavefunction(s) for diagram number 628 + // (none) + // Amplitude(s) for diagram number 628 + FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram629( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 629 OF 1240 *** + // Wavefunction(s) for diagram number 629 + // (none) + // Amplitude(s) for diagram number 629 + FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram630( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 630 OF 1240 *** + // Wavefunction(s) for diagram number 630 + // (none) + // Amplitude(s) for diagram number 630 + FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram631( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 631 OF 1240 *** + // Wavefunction(s) for diagram number 631 + VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 631 + FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram632( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for 
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 632 OF 1240 *** + // Wavefunction(s) for diagram number 632 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] ); + // Amplitude(s) for diagram number 632 + FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram633( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 633 OF 1240 *** + // Wavefunction(s) for diagram number 633 + // (none) + // Amplitude(s) for diagram number 633 + FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram634( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 634 OF 1240 *** + // Wavefunction(s) for diagram number 634 
+ // (none) + // Amplitude(s) for diagram number 634 + VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram635( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 635 OF 1240 *** + // Wavefunction(s) for diagram number 635 + // (none) + // Amplitude(s) for diagram number 635 + FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram636( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 636 OF 1240 ***
+    // Wavefunction(s) for diagram number 636
+    // (none)
+    // Amplitude(s) for diagram number 636
+    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram637( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 637 OF 1240 ***
+    // Wavefunction(s) for diagram number 637
+    // (none)
+    // Amplitude(s) for diagram number 637
+    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram638( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 638 OF 1240 ***
+    // Wavefunction(s) for diagram number 638
+    // (none)
+    // Amplitude(s) for diagram number 638
+    FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram639( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 639 OF 1240 ***
+    // Wavefunction(s) for diagram number 639
+    FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+    // Amplitude(s) for diagram number 639
+    FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram640( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 640 OF 1240 ***
+    // Wavefunction(s) for diagram number 640
+    FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 640
+    FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram641( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 641 OF 1240 ***
+    // Wavefunction(s) for diagram number 641
+    // (none)
+    // Amplitude(s) for diagram number 641
+    FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram642( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 642 OF 1240 ***
+    // Wavefunction(s) for diagram number 642
+    // (none)
+    // Amplitude(s) for diagram number 642
+    FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram643( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 643 OF 1240 ***
+    // Wavefunction(s) for diagram number 643
+    FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+    // Amplitude(s) for diagram number 643
+    FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram644( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 644 OF 1240 ***
+    // Wavefunction(s) for diagram number 644
+    // (none)
+    // Amplitude(s) for diagram number 644
+    FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram645( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 645 OF 1240 ***
+    // Wavefunction(s) for diagram number 645
+    // (none)
+    // Amplitude(s) for diagram number 645
+    FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram646( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 646 OF 1240 ***
+    // Wavefunction(s) for diagram number 646
+    // (none)
+    // Amplitude(s) for diagram number 646
+    FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram647( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 647 OF 1240 ***
+    // Wavefunction(s) for diagram number 647
+    // (none)
+    // Amplitude(s) for diagram number 647
+    FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram648( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 648 OF 1240 ***
+    // Wavefunction(s) for diagram number 648
+    // (none)
+    // Amplitude(s) for diagram number 648
+    FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram649( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 649 OF 1240 ***
+    // Wavefunction(s) for diagram number 649
+    // (none)
+    // Amplitude(s) for diagram number 649
+    FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram650( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 650 OF 1240 ***
+    // Wavefunction(s) for diagram number 650
+    // (none)
+    // Amplitude(s) for diagram number 650
+    FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram651( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 651 OF 1240 ***
+    // Wavefunction(s) for diagram number 651
+    // (none)
+    // Amplitude(s) for diagram number 651
+    FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram652( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 652 OF 1240 ***
+    // Wavefunction(s) for diagram number 652
+    // (none)
+    // Amplitude(s) for diagram number 652
+    FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram653( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 653 OF 1240 ***
+    // Wavefunction(s) for diagram number 653
+    // (none)
+    // Amplitude(s) for diagram number 653
+    FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram654( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 654 OF 1240 ***
+    // Wavefunction(s) for diagram number 654
+    // (none)
+    // Amplitude(s) for diagram number 654
+    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram655( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 655 OF 1240 ***
+    // Wavefunction(s) for diagram number 655
+    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 655
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram656( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 656 OF 1240 ***
+    // Wavefunction(s) for diagram number 656
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
+    // Amplitude(s) for diagram number 656
+    VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram657( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 657 OF 1240 ***
+    // Wavefunction(s) for diagram number 657
+    // (none)
+    // Amplitude(s) for diagram number 657
+    VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram658( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 658 OF 1240 ***
+    // Wavefunction(s) for diagram number 658
+    // (none)
+    // Amplitude(s) for diagram number 658
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram659( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 659 OF 1240 ***
+    // Wavefunction(s) for diagram number 659
+    // (none)
+    // Amplitude(s) for diagram number 659
+    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram660( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 660 OF 1240 ***
+    // Wavefunction(s) for diagram number 660
+    // (none)
+    // Amplitude(s) for diagram number 660
+    FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram661( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 661 OF 1240 ***
+    // Wavefunction(s) for diagram number 661
+    // (none)
+    // Amplitude(s) for diagram number 661
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram662( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 662 OF 1240 ***
+    // Wavefunction(s) for diagram number 662
+    // (none)
+    // Amplitude(s) for diagram number 662
+    FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram663( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 663 OF 1240 ***
+    // Wavefunction(s) for diagram number 663
+    // (none)
+    // Amplitude(s) for diagram number 663
+    FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram664( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 664 OF 1240 ***
+    // Wavefunction(s) for diagram number 664
+    // (none)
+    // Amplitude(s) for diagram number 664
+    FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram665( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 665 OF 1240 ***
+    // Wavefunction(s) for diagram number 665
+    // (none)
+    // Amplitude(s) for diagram number 665
+    FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram666( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 666 OF 1240 ***
+    // Wavefunction(s) for diagram number 666
+    // (none)
+    // Amplitude(s) for diagram number 666
+    FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram667( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 667 OF 1240 *** + // Wavefunction(s) for diagram number 667 + // (none) + // Amplitude(s) for diagram number 667 + FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram668( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 668 OF 1240 *** + // Wavefunction(s) for diagram number 668 + // (none) + // Amplitude(s) for diagram number 668 + FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 669 OF 1240 *** + // Wavefunction(s) for diagram number 669 + // (none) + // Amplitude(s) for diagram number 669 + FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram670( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { 
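// Note: "diagram_boilerplate.h", included just below, is not shown in this diff; a minimal
// sketch of what the two boilerplate comments describe it doing (all names and details in
// this sketch are assumptions for illustration only, not the actual header):
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // The uniform interface is kept even without multichannel support, but the three
//   // extra pointers must not be used: assert that they are all nullptr as a sanity check
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif
//   fptype* w_fp[nwf];                                          // views into the wfs buffer
//   fptype amp_fp[2 * neppV];                                   // local amplitude buffer...
//   cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // ...seen as vector complex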
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 670 OF 1240 *** + // Wavefunction(s) for diagram number 670 + // (none) + // Amplitude(s) for diagram number 670 + VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram671( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 671 OF 1240 *** + // Wavefunction(s) for diagram number 671 + VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 671 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram672( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 672 OF 1240 *** + // Wavefunction(s) for diagram number 672 + // (none) + // Amplitude(s) for diagram number 672 + VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram673( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 673 OF 1240 *** + // Wavefunction(s) for diagram number 673 + // (none) + // Amplitude(s) for diagram number 673 + VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + 
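// Note: each amplitude enters jamps in pairs of color flows with opposite signs; the
// +/-1 (or +/-i, written as cxtype( 0, 1 )) coefficients are the color-flow decomposition
// factors of this diagram's color structure, precomputed by the code generator.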
J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram674( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 674 OF 1240 *** + // Wavefunction(s) for diagram number 674 + // (none) + // Amplitude(s) for diagram number 674 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram675( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 675 OF 1240 *** + // Wavefunction(s) for diagram number 675 + // (none) + // Amplitude(s) for diagram number 675 + FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram676( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 676 OF 1240 *** + // Wavefunction(s) for diagram number 676 + // (none) + // Amplitude(s) for diagram number 676 + FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram677( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 677 OF 1240 *** + // Wavefunction(s) for diagram number 677 + // (none) + // Amplitude(s) for diagram number 677 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
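// Note: w_fp[104] used by this diagram is the internal VVV1P0_1 wavefunction computed once
// in diagram671 and shared with diagram674; it stays valid in the wfs buffer until it is
// overwritten (in diagram680 below), which is why these per-diagram kernels are meant to
// run in the generated order over the same wfs buffer.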
J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram678( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 678 OF 1240 *** + // Wavefunction(s) for diagram number 678 + // (none) + // Amplitude(s) for diagram number 678 + FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram679( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 679 OF 1240 *** + // Wavefunction(s) for diagram number 679 + // (none) + // Amplitude(s) for diagram number 679 + FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram680( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 680 OF 1240 *** + // Wavefunction(s) for diagram number 680 + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 680 + VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram681( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used 
also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 681 OF 1240 *** + // Wavefunction(s) for diagram number 681 + // (none) + // Amplitude(s) for diagram number 681 + VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram682( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 682 OF 1240 *** + // Wavefunction(s) for diagram number 682 + // (none) + // Amplitude(s) for diagram number 682 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + 
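// Note: the recurring "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" blocks mark where multichannel
// builds update the single-diagram-enhancement statistics referenced as issue #473; a
// minimal sketch of the kind of update meant there (the exact generated statements are an
// assumption; cxabs2 is the squared-modulus helper used elsewhere in the cudacpp headers):
//   if( channelId == 682 ) numerators_sv += cxabs2( amp_sv[0] );
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );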
J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram683( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 683 OF 1240 *** + // Wavefunction(s) for diagram number 683 + // (none) + // Amplitude(s) for diagram number 683 + VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram684( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 684 OF 1240 *** + // Wavefunction(s) for diagram number 684 + // (none) + // Amplitude(s) for diagram number 684 + VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram685( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 685 OF 1240 *** + // Wavefunction(s) for diagram number 685 + // (none) + // Amplitude(s) for diagram number 685 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], 
w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram686( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 686 OF 1240 *** + // Wavefunction(s) for diagram number 686 + // (none) + // Amplitude(s) for diagram number 686 + VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram687( fptype* 
wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 687 OF 1240 *** + // Wavefunction(s) for diagram number 687 + // (none) + // Amplitude(s) for diagram number 687 + VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram688( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 688 OF 1240 *** + // Wavefunction(s) for diagram number 688 + // (none) + // Amplitude(s) for diagram number 688 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram689( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 689 OF 1240 *** + // Wavefunction(s) for diagram number 689 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] ); + // Amplitude(s) for diagram number 689 + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 690 OF 1240 *** + // Wavefunction(s) for diagram number 690 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 690 + VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
+  __global__ void
+  diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 690 OF 1240 ***
+    // Wavefunction(s) for diagram number 690
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 690
+    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram691( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 691 OF 1240 ***
+    // Wavefunction(s) for diagram number 691
+    // (none)
+    // Amplitude(s) for diagram number 691
+    VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 692 OF 1240 ***
+    // Wavefunction(s) for diagram number 692
+    // (none)
+    // Amplitude(s) for diagram number 692
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram693( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 693 OF 1240 ***
+    // Wavefunction(s) for diagram number 693
+    // (none)
+    // Amplitude(s) for diagram number 693
+    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram694( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 694 OF 1240 ***
+    // Wavefunction(s) for diagram number 694
+    // (none)
+    // Amplitude(s) for diagram number 694
+    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram695( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 695 OF 1240 ***
+    // Wavefunction(s) for diagram number 695
+    // (none)
+    // Amplitude(s) for diagram number 695
+    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
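The kernels up to this point accumulate amp_sv[0] into the colour-flow array with bare +=/-= signs; from diagram 696 onward many contributions instead enter multiplied by cxtype( 0, 1 ), i.e. by the imaginary unit that arises in the colour decomposition. A minimal stand-alone illustration of the two accumulation patterns, with std::complex as a stand-in for the plugin's cxtype:

  #include <complex>
  #include <iostream>
  int main()
  {
    using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
    cxtype amp( 0.3, -1.2 );             // one computed Feynman amplitude
    cxtype jamp[2] = {};                 // two toy colour flows
    jamp[0] += amp;                      // plain +=/-= pattern (e.g. diagram 693)
    jamp[1] += cxtype( 0, 1 ) * amp;     // i*amp pattern (e.g. diagram 696): multiply by the imaginary unit
    std::cout << jamp[0] << " " << jamp[1] << "\n";
    return 0;
  }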
+  __global__ void
+  diagram696( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 696 OF 1240 ***
+    // Wavefunction(s) for diagram number 696
+    // (none)
+    // Amplitude(s) for diagram number 696
+    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram697( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 697 OF 1240 ***
+    // Wavefunction(s) for diagram number 697
+    // (none)
+    // Amplitude(s) for diagram number 697
+    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram698( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 698 OF 1240 ***
+    // Wavefunction(s) for diagram number 698
+    // (none)
+    // Amplitude(s) for diagram number 698
+    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram699( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 699 OF 1240 ***
+    // Wavefunction(s) for diagram number 699
+    // (none)
+    // Amplitude(s) for diagram number 699
+    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram700( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 700 OF 1240 ***
+    // Wavefunction(s) for diagram number 700
+    // (none)
+    // Amplitude(s) for diagram number 700
+    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram701( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 701 OF 1240 ***
+    // Wavefunction(s) for diagram number 701
+    // (none)
+    // Amplitude(s) for diagram number 701
+    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram702( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 702 OF 1240 ***
+    // Wavefunction(s) for diagram number 702
+    // (none)
+    // Amplitude(s) for diagram number 702
+    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram703( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 703 OF 1240 ***
+    // Wavefunction(s) for diagram number 703
+    // (none)
+    // Amplitude(s) for diagram number 703
+    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram704( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 704 OF 1240 ***
+    // Wavefunction(s) for diagram number 704
+    // (none)
+    // Amplitude(s) for diagram number 704
+    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram705( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 705 OF 1240 ***
+    // Wavefunction(s) for diagram number 705
+    // (none)
+    // Amplitude(s) for diagram number 705
+    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram706( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 706 OF 1240 ***
+    // Wavefunction(s) for diagram number 706
+    // (none)
+    // Amplitude(s) for diagram number 706
+    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram707( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 707 OF 1240 ***
+    // Wavefunction(s) for diagram number 707
+    // (none)
+    // Amplitude(s) for diagram number 707
+    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram708( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 708 OF 1240 ***
+    // Wavefunction(s) for diagram number 708
+    // (none)
+    // Amplitude(s) for diagram number 708
+    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram709( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 709 OF 1240 ***
+    // Wavefunction(s) for diagram number 709
+    // (none)
+    // Amplitude(s) for diagram number 709
+    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram710( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 710 OF 1240 ***
+    // Wavefunction(s) for diagram number 710
+    // (none)
+    // Amplitude(s) for diagram number 710
+    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
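The payoff of the identical signature shared by every diagramNNN kernel is that a driver can dispatch them uniformly, for example from a table of function pointers, while jamps accumulates across the calls. The stand-alone sketch below is a hypothetical reduction of that idea; the actual driver is not part of this hunk and all toy_* names are invented for illustration.

  #include <cstddef>
  typedef void ( *ToyDiagramFn )( double* wfs, double* jamps );
  // Stand-ins for diagram689, diagram690, ...: each reads wavefunctions and
  // accumulates its contribution into the shared colour-flow array.
  static void toy_diag1( double* wfs, double* jamps ) { jamps[0] += wfs[0]; }
  static void toy_diag2( double* wfs, double* jamps ) { jamps[1] -= wfs[1]; }
  int main()
  {
    double wfs[2] = { 1., 2. };
    double jamps[2] = {};
    const ToyDiagramFn diagrams[] = { toy_diag1, toy_diag2 };
    for( std::size_t i = 0; i < sizeof( diagrams ) / sizeof( diagrams[0] ); ++i )
      diagrams[i]( wfs, jamps ); // jamps accumulates across diagrams
    return 0;
  }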
+  __global__ void
+  diagram711( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 711 OF 1240 ***
+    // Wavefunction(s) for diagram number 711
+    // (none)
+    // Amplitude(s) for diagram number 711
+    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram712( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 712 OF 1240 ***
+    // Wavefunction(s) for diagram number 712
+    // (none)
+    // Amplitude(s) for diagram number 712
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram713( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 713 OF 1240 ***
+    // Wavefunction(s) for diagram number 713
+    // (none)
+    // Amplitude(s) for diagram number 713
+    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram714( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 714 OF 1240 ***
+    // Wavefunction(s) for diagram number 714
+    // (none)
+    // Amplitude(s) for diagram number 714
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram715( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 715 OF 1240 ***
+    // Wavefunction(s) for diagram number 715
+    // (none)
+    // Amplitude(s) for diagram number 715
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram716( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 716 OF 1240 ***
+    // Wavefunction(s) for diagram number 716
+    // (none)
+    // Amplitude(s) for diagram number 716
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram717( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 717 OF 1240 ***
+    // Wavefunction(s) for diagram number 717
+    // (none)
+    // Amplitude(s) for diagram number 717
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram718( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 718 OF 1240 ***
+    // Wavefunction(s) for diagram number 718
+    // (none)
+    // Amplitude(s) for diagram number 718
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram719( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 719 OF 1240 ***
+    // Wavefunction(s) for diagram number 719
+    // (none)
+    // Amplitude(s) for diagram number 719
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram720( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add
helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 720 OF 1240 *** + // Wavefunction(s) for diagram number 720 + // (none) + // Amplitude(s) for diagram number 720 + FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram721( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 721 OF 1240 *** + // Wavefunction(s) for diagram number 721 + // (none) + // Amplitude(s) for diagram number 721 + VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram722( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 722 OF 1240 *** + // Wavefunction(s) for diagram number 722 + // (none) + // Amplitude(s) for diagram number 722 + FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram723( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for 
this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 723 OF 1240 *** + // Wavefunction(s) for diagram number 723 + // (none) + // Amplitude(s) for diagram number 723 + VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram724( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 724 OF 1240 *** + // Wavefunction(s) for diagram number 724 + // (none) + // Amplitude(s) for diagram number 724 + FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram725( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 725 OF 1240 *** + // Wavefunction(s) for diagram number 725 + // (none) + // Amplitude(s) for diagram number 725 + FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram726( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 726 OF 1240 *** + // Wavefunction(s) for diagram number 726 + // (none) + // Amplitude(s) for diagram number 726 + FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram727( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX 
including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 727 OF 1240 *** + // Wavefunction(s) for diagram number 727 + // (none) + // Amplitude(s) for diagram number 727 + FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram728( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 728 OF 1240 *** + // Wavefunction(s) for diagram number 728 + // (none) + // Amplitude(s) for diagram number 728 + VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram729( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators 
) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 729 OF 1240 *** + // Wavefunction(s) for diagram number 729 + // (none) + // Amplitude(s) for diagram number 729 + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram730( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 730 OF 1240 *** + // Wavefunction(s) for diagram number 730 + // (none) + // Amplitude(s) for diagram number 730 + FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram731( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 731 OF 1240 *** + // Wavefunction(s) for diagram number 731 + // (none) + // Amplitude(s) for diagram number 731 + FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram732( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 732 OF 1240 *** + // Wavefunction(s) for diagram number 732 + // (none) + // Amplitude(s) for diagram number 732 + FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram733( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 733 OF 1240 *** + // Wavefunction(s) for diagram number 733 + // (none) + // Amplitude(s) for diagram number 733 + FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram734( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 734 OF 1240 *** + // Wavefunction(s) for diagram number 734 + // (none) + // Amplitude(s) for diagram number 734 + FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram735( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 735 OF 1240 *** + // Wavefunction(s) for diagram number 735 + // (none) + // Amplitude(s) for diagram number 735 + FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram736( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 736 OF 1240 *** + // Wavefunction(s) for diagram number 736 + // (none) + // Amplitude(s) for diagram number 736 + FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram737( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 737 OF 1240 *** + // Wavefunction(s) for diagram number 737 + // (none) + // Amplitude(s) for diagram number 737 + FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram738( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 738 OF 1240 *** + // Wavefunction(s) for diagram number 738 + // (none) + // Amplitude(s) for diagram number 738 + VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram739( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 739 OF 1240 *** + // Wavefunction(s) for diagram number 739 + FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] ); + // Amplitude(s) for diagram number 739 + FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram740( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) 
// input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 740 OF 1240 *** + // Wavefunction(s) for diagram number 740 + // (none) + // Amplitude(s) for diagram number 740 + FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram741( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 741 OF 1240 *** + // Wavefunction(s) for diagram number 741 + FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 741 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram742( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 742 OF 1240 *** + // Wavefunction(s) for diagram number 742 + // (none) + // Amplitude(s) for diagram number 742 + FFV1_0( w_fp[99], 
w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram743( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 743 OF 1240 ***
+    // Wavefunction(s) for diagram number 743
+    // (none)
+    // Amplitude(s) for diagram number 743
+    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram744( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 744 OF 1240 ***
+    // Wavefunction(s) for diagram number 744
+    // (none)
+    // Amplitude(s) for diagram number 744
+    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram745( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 745 OF 1240 ***
+    // Wavefunction(s) for diagram number 745
+    // (none)
+    // Amplitude(s) for diagram number 745
+    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram746( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 746 OF 1240 ***
+    // Wavefunction(s) for diagram number 746
+    // (none)
+    // Amplitude(s) for diagram number 746
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram747( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 747 OF 1240 ***
+    // Wavefunction(s) for diagram number 747
+    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 747
+    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram748( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 748 OF 1240 ***
+    // Wavefunction(s) for diagram number 748
+    // (none)
+    // Amplitude(s) for diagram number 748
+    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram749( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
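+    // As a concrete illustration of the sanity check described above, in builds without
+    // MGONGPU_SUPPORTS_MULTICHANNEL the shared header is expected to reduce to something like the
+    // following sketch (an assumption inferred from the comment above, not the verbatim header):
+    //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );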
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 749 OF 1240 ***
+    // Wavefunction(s) for diagram number 749
+    // (none)
+    // Amplitude(s) for diagram number 749
+    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram750( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 750 OF 1240 ***
+    // Wavefunction(s) for diagram number 750
+    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+    // Amplitude(s) for diagram number 750
+    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram751( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 751 OF 1240 ***
+    // Wavefunction(s) for diagram number 751
+    // (none)
+    // Amplitude(s) for diagram number 751
+    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram752( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 752 OF 1240 ***
+    // Wavefunction(s) for diagram number 752
+    // (none)
+    // Amplitude(s) for diagram number 752
+    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram753( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 753 OF 1240 ***
+    // Wavefunction(s) for diagram number 753
+    // (none)
+    // Amplitude(s) for diagram number 753
+    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram754( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 754 OF 1240 ***
+    // Wavefunction(s) for diagram number 754
+    // (none)
+    // Amplitude(s) for diagram number 754
+    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram755( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 755 OF 1240 ***
+    // Wavefunction(s) for diagram number 755
+    // (none)
+    // Amplitude(s) for diagram number 755
+    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram756( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 756 OF 1240 ***
+    // Wavefunction(s) for diagram number 756
+    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 756
+    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram757( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 757 OF 1240 ***
+    // Wavefunction(s) for diagram number 757
+    // (none)
+    // Amplitude(s) for diagram number 757
+    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram758( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 758 OF 1240 ***
+    // Wavefunction(s) for diagram number 758
+    // (none)
+    // Amplitude(s) for diagram number 758
+    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram759( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 759 OF 1240 ***
+    // Wavefunction(s) for diagram number 759
+    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 759
+    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram760( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 760 OF 1240 ***
+    // Wavefunction(s) for diagram number 760
+    // (none)
+    // Amplitude(s) for diagram number 760
+    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram761( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 761 OF 1240 ***
+    // Wavefunction(s) for diagram number 761
+    // (none)
+    // Amplitude(s) for diagram number 761
+    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram762( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 762 OF 1240 ***
+    // Wavefunction(s) for diagram number 762
+    // (none)
+    // Amplitude(s) for diagram number 762
+    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram763( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 763 OF 1240 ***
+    // Wavefunction(s) for diagram number 763
+    // (none)
+    // Amplitude(s) for diagram number 763
+    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram764( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 764 OF 1240 ***
+    // Wavefunction(s) for diagram number 764
+    // (none)
+    // Amplitude(s) for diagram number 764
+    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram765( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 765 OF 1240 ***
+    // Wavefunction(s) for diagram number 765
+    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 765
+    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram766( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 766 OF 1240 ***
+    // Wavefunction(s) for diagram number 766
+    // (none)
+    // Amplitude(s) for diagram number 766
+    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram767( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 767 OF 1240 ***
+    // Wavefunction(s) for diagram number 767
+    // (none)
+    // Amplitude(s) for diagram number 767
+    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram768( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 768 OF 1240 ***
+    // Wavefunction(s) for diagram number 768
+    // (none)
+    // Amplitude(s) for diagram number 768
+    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram769( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 769 OF 1240 ***
+    // Wavefunction(s) for diagram number 769
+    // (none)
+    // Amplitude(s) for diagram number 769
+    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram770( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 770 OF 1240 ***
+    // Wavefunction(s) for diagram number 770
+    // (none)
+    // Amplitude(s) for diagram number 770
+    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram771( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 771 OF 1240 ***
+    // Wavefunction(s) for diagram number 771
+    // (none)
+    // Amplitude(s) for diagram number 771
+    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram772( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 772 OF 1240 ***
+    // Wavefunction(s) for diagram number 772
+    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
+    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 772
+    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
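+    // Note: the three VVVVnP0_1 calls above (n = 1, 3, 4) build internal wavefunctions for the
+    // three independent Lorentz structures of the four-gluon vertex; each FFV1_0 amplitude
+    // computed from them feeds a different set of colour flows, the cxtype( 0, 1 ) factors
+    // adding +/-i times the amplitude into the corresponding jamps.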
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram773( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 773 OF 1240 *** + // Wavefunction(s) for diagram number 773 + // (none) + // Amplitude(s) for diagram number 773 + FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram774( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 774 OF 1240 *** + // Wavefunction(s) for diagram number 774 + // (none) + // Amplitude(s) for diagram number 774 + FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram775( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 775 OF 1240 *** + // Wavefunction(s) for diagram number 775 + // (none) + // Amplitude(s) for diagram number 775 + VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram776( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 776 OF 1240 *** + // Wavefunction(s) for diagram number 776 + // (none) + // Amplitude(s) for diagram number 776 + FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 
1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram777( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 777 OF 1240 *** + // Wavefunction(s) for diagram number 777 + // (none) + // Amplitude(s) for diagram number 777 + VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram778( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 778 OF 1240 *** + // Wavefunction(s) for 
diagram number 778 + // (none) + // Amplitude(s) for diagram number 778 + FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram779( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 779 OF 1240 *** + // Wavefunction(s) for diagram number 779 + VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] ); + VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); + // Amplitude(s) for diagram number 779 + FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram780( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 780 OF 1240 *** + // Wavefunction(s) for diagram number 780 + // (none) + // Amplitude(s) for diagram number 780 + FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram781( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 781 OF 1240 *** + // Wavefunction(s) for diagram number 781 + // (none) + // Amplitude(s) for diagram number 781 + FFV1_0( w_fp[3], w_fp[92], 
+  __global__ void
+  diagram782( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 782 OF 1240 ***
+    // Wavefunction(s) for diagram number 782
+    // (none)
+    // Amplitude(s) for diagram number 782
+    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram783( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 783 OF 1240 ***
+    // Wavefunction(s) for diagram number 783
+    // (none)
+    // Amplitude(s) for diagram number 783
+    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram784( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 784 OF 1240 ***
+    // Wavefunction(s) for diagram number 784
+    // (none)
+    // Amplitude(s) for diagram number 784
+    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram785( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 785 OF 1240 ***
+    // Wavefunction(s) for diagram number 785
+    // (none)
+    // Amplitude(s) for diagram number 785
+    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram786( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 786 OF 1240 ***
+    // Wavefunction(s) for diagram number 786
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 786
+    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
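The "(#473)" comments mark the spots where a build with MGONGPU_SUPPORTS_MULTICHANNEL updates the single-diagram-enhancement (SDE) numerators and denominators; the statements themselves are elided in this hunk. Following the pattern used elsewhere in the cudacpp code base, they plausibly add |amp|^2 to the numerator only when the event's channel matches the current diagram, and to the denominator for every diagram. A hedged scalar sketch (the helper name and signature are hypothetical):

    using fptype = double;

    // channelId runs from 1 to #diagrams; 0 disables single-diagram enhancement (SDE)
    inline void updateMultichannel( unsigned int channelId, unsigned int idiagram, fptype amp2, fptype& numerator, fptype& denominator )
    {
      if( channelId == 0 ) return;                   // SDE disabled
      if( channelId == idiagram ) numerator += amp2; // this diagram is the selected channel
      denominator += amp2;                           // every diagram contributes to the denominator
    }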
+  __global__ void
+  diagram787( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 787 OF 1240 ***
+    // Wavefunction(s) for diagram number 787
+    // (none)
+    // Amplitude(s) for diagram number 787
+    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram788( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 788 OF 1240 ***
+    // Wavefunction(s) for diagram number 788
+    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
+    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
+    // Amplitude(s) for diagram number 788
+    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram789( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 789 OF 1240 ***
+    // Wavefunction(s) for diagram number 789
+    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 789
+    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram790( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 790 OF 1240 ***
+    // Wavefunction(s) for diagram number 790
+    // (none)
+    // Amplitude(s) for diagram number 790
+    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram791( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 791 OF 1240 ***
+    // Wavefunction(s) for diagram number 791
+    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 791
+    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
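A note on the FFV1_2 and FFV1_1 calls above (diagrams 789 and 791): unlike the massless VVV/VVVV propagator helpers, they build an internal fermion wavefunction from an external fermion and a gluon, passing a propagator mass cIPD[0] and width cIPD[1]. For a 1240-diagram process of this kind (gg to ttbar plus three gluons) these are presumably the top-quark mass and width, but the cIPD definition lies outside this hunk, so that reading is an inference rather than something this patch shows.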
+  __global__ void
+  diagram792( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 792 OF 1240 ***
+    // Wavefunction(s) for diagram number 792
+    // (none)
+    // Amplitude(s) for diagram number 792
+    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram793( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 793 OF 1240 ***
+    // Wavefunction(s) for diagram number 793
+    // (none)
+    // Amplitude(s) for diagram number 793
+    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram794( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 794 OF 1240 ***
+    // Wavefunction(s) for diagram number 794
+    // (none)
+    // Amplitude(s) for diagram number 794
+    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram795( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 795 OF 1240 ***
+    // Wavefunction(s) for diagram number 795
+    // (none)
+    // Amplitude(s) for diagram number 795
+    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram796( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 796 OF 1240 ***
+    // Wavefunction(s) for diagram number 796
+    // (none)
+    // Amplitude(s) for diagram number 796
+    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram797( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 797 OF 1240 ***
+    // Wavefunction(s) for diagram number 797
+    // (none)
+    // Amplitude(s) for diagram number 797
+    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram798( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 798 OF 1240 ***
+    // Wavefunction(s) for diagram number 798
+    // (none)
+    // Amplitude(s) for diagram number 798
+    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram799( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 799 OF 1240 ***
+    // Wavefunction(s) for diagram number 799
+    // (none)
+    // Amplitude(s) for diagram number 799
+    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram800( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 800 OF 1240 ***
+    // Wavefunction(s) for diagram number 800
+    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 800
+    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
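These kernels carry the __global__ qualifier yet, per the #ifdef MGONGPUCPP_GPUIMPL split in their signatures, the same source is also meant to compile as plain C++. The cudacpp plugin achieves this by defining __global__ away in non-GPU builds in its GPU abstraction header; the two-line sketch below only illustrates that mechanism and is not the actual header:

    // Illustration only (assumption about the abstraction header, not its real contents)
    #ifndef MGONGPUCPP_GPUIMPL
    #define __global__ // in C++ builds the CUDA kernel qualifier expands to nothing
    #endif

    __global__ void diagramExample() {} // a CUDA kernel on GPU, a plain function on CPU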
+  __global__ void
+  diagram801( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 801 OF 1240 ***
+    // Wavefunction(s) for diagram number 801
+    // (none)
+    // Amplitude(s) for diagram number 801
+    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram802( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 802 OF 1240 ***
+    // Wavefunction(s) for diagram number 802
+    // (none)
+    // Amplitude(s) for diagram number 802
+    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram803( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 803 OF 1240 ***
+    // Wavefunction(s) for diagram number 803
+    // (none)
+    // Amplitude(s) for diagram number 803
+    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram804( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 804 OF 1240 ***
+    // Wavefunction(s) for diagram number 804
+    // (none)
+    // Amplitude(s) for diagram number 804
+    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram805( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 805 OF 1240 ***
+    // Wavefunction(s) for diagram number 805
+    // (none)
+    // Amplitude(s) for diagram number 805
+    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram806( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 806 OF 1240 ***
+    // Wavefunction(s) for diagram number 806
+    // (none)
+    // Amplitude(s) for diagram number 806
+    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram807( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 807 OF 1240 ***
+    // Wavefunction(s) for diagram number 807
+    // (none)
+    // Amplitude(s) for diagram number 807
+    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram808( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 808 OF 1240 ***
+    // Wavefunction(s) for diagram number 808
+    // (none)
+    // Amplitude(s) for diagram number 808
+    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram809( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 809 OF 1240 ***
+    // Wavefunction(s) for diagram number 809
+    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 809
+    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram810( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 810 OF 1240 ***
+    // Wavefunction(s) for diagram number 810
+    // (none)
+    // Amplitude(s) for diagram number 810
+    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram811( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 811 OF 1240 ***
+    // Wavefunction(s) for diagram number 811
+    // (none)
+    // Amplitude(s) for diagram number 811
+    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram812( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 812 OF 1240 ***
+    // Wavefunction(s) for diagram number 812
+    // (none)
+    // Amplitude(s) for diagram number 812
+    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram813( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 813 OF 1240 ***
+    // Wavefunction(s) for diagram number 813
+    // (none)
+    // Amplitude(s) for diagram number 813
+    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram814( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 814 OF 1240 ***
+    // Wavefunction(s) for diagram number 814
+    // (none)
+    // Amplitude(s) for diagram number 814
+    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram815( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 815 OF 1240 ***
+    // Wavefunction(s) for diagram number 815
+    // (none)
+    // Amplitude(s) for diagram number 815
+    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 816 OF 1240 *** + // Wavefunction(s) for diagram number 816 + // (none) + // Amplitude(s) for diagram number 816 + FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram817( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 817 OF 1240 *** + // Wavefunction(s) for diagram number 817 + // (none) + // Amplitude(s) for diagram number 817 + FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram818( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 818 OF 1240 *** + // Wavefunction(s) for diagram number 818 + // (none) + // Amplitude(s) for diagram number 818 + 
VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram819( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 819 OF 1240 *** + // Wavefunction(s) for diagram number 819 + // (none) + // Amplitude(s) for diagram number 819 + FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram820( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
DIAGRAM 820 OF 1240 *** + // Wavefunction(s) for diagram number 820 + // (none) + // Amplitude(s) for diagram number 820 + VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram821( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 821 OF 1240 *** + // Wavefunction(s) for diagram number 821 + // (none) + // Amplitude(s) for diagram number 821 + FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram822( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
+#include "diagram_boilerplate.h" + // *** DIAGRAM 822 OF 1240 *** + // Wavefunction(s) for diagram number 822 + // (none) + // Amplitude(s) for diagram number 822 + FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram823( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 823 OF 1240 *** + // Wavefunction(s) for diagram number 823 + // (none) + // Amplitude(s) for diagram number 823 + FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram824( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 824 OF 1240 *** + // Wavefunction(s) for diagram number 824 + // (none) + // Amplitude(s) for diagram number 824 + FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram825( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 825 OF 1240 *** + // Wavefunction(s) for diagram number 825 + // (none) + // Amplitude(s) for diagram number 825 + VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram826( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 826 OF 1240 *** + // Wavefunction(s) for diagram number 826 + // (none) + // Amplitude(s) for diagram number 826 + FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram827( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 827 OF 1240 *** + // Wavefunction(s) for diagram number 
827 + // (none) + // Amplitude(s) for diagram number 827 + VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram828( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 828 OF 1240 *** + // Wavefunction(s) for diagram number 828 + // (none) + // Amplitude(s) for diagram number 828 + FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram829( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 829 OF 1240 
*** + // Wavefunction(s) for diagram number 829 + // (none) + // Amplitude(s) for diagram number 829 + FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram830( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
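[Editor's note] The recurring "(#473)" comment marks the spot where a build with MGONGPU_SUPPORTS_MULTICHANNEL defined also emits the single-diagram-enhancement (SDE) bookkeeping after each amplitude call. A sketch of the likely shape of that update, hedged: the exact generated statements are not part of this hunk; cxabs2 is assumed to be the plugin's |z|^2 helper and channelId the per-event scalar channel derived from channelIds:

    // Illustrative sketch only: SDE bookkeeping for e.g. diagram 829
    if( channelId == 829 ) numerators_sv += cxabs2( amp_sv[0] ); // numerator: only the selected channel's diagram
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // denominator: all diagrams, unless SDE is disabled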
+  __global__ void
+  diagram830( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 830 OF 1240 ***
+    // Wavefunction(s) for diagram number 830
+    // (none)
+    // Amplitude(s) for diagram number 830
+    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram831( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 831 OF 1240 ***
+    // Wavefunction(s) for diagram number 831
+    // (none)
+    // Amplitude(s) for diagram number 831
+    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram832( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 832 OF 1240 ***
+    // Wavefunction(s) for diagram number 832
+    // (none)
+    // Amplitude(s) for diagram number 832
+    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram833( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 833 OF 1240 ***
+    // Wavefunction(s) for diagram number 833
+    // (none)
+    // Amplitude(s) for diagram number 833
+    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram834( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 834 OF 1240 ***
+    // Wavefunction(s) for diagram number 834
+    // (none)
+    // Amplitude(s) for diagram number 834
+    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram835( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 835 OF 1240 ***
+    // Wavefunction(s) for diagram number 835
+    // (none)
+    // Amplitude(s) for diagram number 835
+    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram836( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 836 OF 1240 ***
+    // Wavefunction(s) for diagram number 836
+    // (none)
+    // Amplitude(s) for diagram number 836
+    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram837( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 837 OF 1240 ***
+    // Wavefunction(s) for diagram number 837
+    // (none)
+    // Amplitude(s) for diagram number 837
+    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram838( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 838 OF 1240 ***
+    // Wavefunction(s) for diagram number 838
+    // (none)
+    // Amplitude(s) for diagram number 838
+    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram839( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 839 OF 1240 ***
+    // Wavefunction(s) for diagram number 839
+    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] );
+    // Amplitude(s) for diagram number 839
+    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram840( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 840 OF 1240 ***
+    // Wavefunction(s) for diagram number 840
+    // (none)
+    // Amplitude(s) for diagram number 840
+    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram841( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 841 OF 1240 ***
+    // Wavefunction(s) for diagram number 841
+    // (none)
+    // Amplitude(s) for diagram number 841
+    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
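[Editor's note] Diagrams 841 and 844 call VVVV1_0, VVVV3_0 and VVVV4_0 on the same four wavefunctions: the four-gluon vertex contributes three independent color structures, each with its own coupling slot inside COUPs[2] and its own pattern of jamp updates. The jamps filled throughout these kernels are color-flow amplitudes; they only become a matrix element in a later color sum of the form ME += jamp_i^* cf_ij jamp_j. A schematic of that reduction (hedged: cf, denom, deltaME and cxzero_sv are names assumed for illustration and are not shown in this hunk):

    // Illustrative sketch only: contract color-flow amplitudes with the constant color matrix
    fptype_sv deltaME = { 0 };
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype_sv ztemp_sv = cxzero_sv();
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztemp_sv = ztemp_sv + cf[icol][jcol] * J_ACCESS::kernelAccessIcol( jamps, jcol );
      deltaME += cxreal( ztemp_sv * cxconj( J_ACCESS::kernelAccessIcol( jamps, icol ) ) ) / denom[icol];
    }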
J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram842( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 842 OF 1240 *** + // Wavefunction(s) for diagram number 842 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); + // Amplitude(s) for diagram number 842 + VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram843( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
+  __global__ void
+  diagram843( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 843 OF 1240 ***
+    // Wavefunction(s) for diagram number 843
+    // (none)
+    // Amplitude(s) for diagram number 843
+    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram844( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 844 OF 1240 ***
+    // Wavefunction(s) for diagram number 844
+    // (none)
+    // Amplitude(s) for diagram number 844
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram845( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 845 OF 1240 ***
+    // Wavefunction(s) for diagram number 845
+    // (none)
+    // Amplitude(s) for diagram number 845
+    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram846( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 846 OF 1240 ***
+    // Wavefunction(s) for diagram number 846
+    // (none)
+    // Amplitude(s) for diagram number 846
+    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
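+  // The J_ACCESS::kernelAccessIcol( jamps, icol ) updates implement the colour-flow
+  // decomposition: each amplitude feeds a subset of the ncolor=120 colour jamps with a
+  // coefficient of +1 or -1 (or +/-i, via the explicit cxtype( 0, 1 ) factors further
+  // below), schematically jamp[icol] += colourCoeff( diagram, icol ) * amp. The colour
+  // matrix is only applied later, when |M|^2 is assembled from the jamps.
+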
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram847( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 847 OF 1240 ***
+    // Wavefunction(s) for diagram number 847
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 847
+    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram848( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 848 OF 1240 ***
+    // Wavefunction(s) for diagram number 848
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    // Amplitude(s) for diagram number 848
+    VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram849( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 849 OF 1240 ***
+    // Wavefunction(s) for diagram number 849
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] );
+    // Amplitude(s) for diagram number 849
+    VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram850( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 850 OF 1240 ***
+    // Wavefunction(s) for diagram number 850
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] );
+    // Amplitude(s) for diagram number 850
+    VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram851( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 851 OF 1240 ***
+    // Wavefunction(s) for diagram number 851
+    // (none)
+    // Amplitude(s) for diagram number 851
+    VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram852( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 852 OF 1240 ***
+    // Wavefunction(s) for diagram number 852
+    // (none)
+    // Amplitude(s) for diagram number 852
+    VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram853( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 853 OF 1240 ***
+    // Wavefunction(s) for diagram number 853
+    // (none)
+    // Amplitude(s) for diagram number 853
+    VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
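+  // Diagrams 841-853 above involve the four-gluon vertex: VVVV1_0, VVVV3_0 and VVVV4_0
+  // evaluate its three independent colour/Lorentz structures on the same four wavefunctions,
+  // while the VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 variants compute the corresponding off-shell
+  // internal wavefunctions (e.g. w_fp[103], w_fp[22], w_fp[21] in diagram 847); each
+  // structure comes with its own sign pattern over the jamps.
+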
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram854( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 854 OF 1240 ***
+    // Wavefunction(s) for diagram number 854
+    // (none)
+    // Amplitude(s) for diagram number 854
+    VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram855( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 855 OF 1240 ***
+    // Wavefunction(s) for diagram number 855
+    // (none)
+    // Amplitude(s) for diagram number 855
+    VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram856( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 856 OF 1240 ***
+    // Wavefunction(s) for diagram number 856
+    // (none)
+    // Amplitude(s) for diagram number 856
+    FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram857( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 857 OF 1240 ***
+    // Wavefunction(s) for diagram number 857
+    // (none)
+    // Amplitude(s) for diagram number 857
+    FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram858( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 858 OF 1240 ***
+    // Wavefunction(s) for diagram number 858
+    // (none)
+    // Amplitude(s) for diagram number 858
+    FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram859( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 859 OF 1240 ***
+    // Wavefunction(s) for diagram number 859
+    // (none)
+    // Amplitude(s) for diagram number 859
+    FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram860( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 860 OF 1240 ***
+    // Wavefunction(s) for diagram number 860
+    // (none)
+    // Amplitude(s) for diagram number 860
+    VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram861( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 861 OF 1240 ***
+    // Wavefunction(s) for diagram number 861
+    // (none)
+    // Amplitude(s) for diagram number 861
+    FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram862( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 862 OF 1240 ***
+    // Wavefunction(s) for diagram number 862
+    // (none)
+    // Amplitude(s) for diagram number 862
+    FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram863( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 863 OF 1240 ***
+    // Wavefunction(s) for diagram number 863
+    // (none)
+    // Amplitude(s) for diagram number 863
+    FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram864( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 864 OF 1240 ***
+    // Wavefunction(s) for diagram number 864
+    // (none)
+    // Amplitude(s) for diagram number 864
+    FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
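+  // In diagrams 855-864 above, several updates carry an explicit cxtype( 0, 1 ) factor:
+  // for those colour flows the colour coefficient is +/-i rather than +/-1, so amp_sv[0]
+  // is multiplied by the imaginary unit before being accumulated into the jamps.
+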
//-------------------------------------------------------------------------- + + __global__ void + diagram865( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 865 OF 1240 *** + // Wavefunction(s) for diagram number 865 + // (none) + // Amplitude(s) for diagram number 865 + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram866( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 866 OF 1240 *** + // Wavefunction(s) for diagram number 866 + // (none) + // Amplitude(s) for diagram number 866 + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram867( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 867 OF 1240 *** + // Wavefunction(s) for diagram number 867 + // (none) + // Amplitude(s) for diagram number 867 + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram868( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 868 OF 1240 *** + // Wavefunction(s) for diagram number 868 + // (none) + // Amplitude(s) for diagram number 868 + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram869( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + 
fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 869 OF 1240 *** + // Wavefunction(s) for diagram number 869 + // (none) + // Amplitude(s) for diagram number 869 + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram870( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 870 OF 1240 *** + // Wavefunction(s) for diagram number 870 + // (none) + // Amplitude(s) for diagram number 870 + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram871( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 871 OF 1240 *** + // Wavefunction(s) for diagram number 871 + // (none) + // Amplitude(s) for diagram number 871 + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram872( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 872 OF 1240 *** + // Wavefunction(s) for diagram number 872 + // (none) + // Amplitude(s) for diagram number 872 + FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram873( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 873 OF 1240 *** + // Wavefunction(s) for diagram number 873 + // (none) + // Amplitude(s) for diagram number 873 + FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram874( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 874 OF 1240 *** + // Wavefunction(s) for diagram number 874 + // (none) + // Amplitude(s) for diagram number 874 + FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram875( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 875 OF 1240 *** + // Wavefunction(s) for diagram number 875 + // (none) + // Amplitude(s) for diagram number 875 + VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram876( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, 
// input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 876 OF 1240 *** + // Wavefunction(s) for diagram number 876 + // (none) + // Amplitude(s) for diagram number 876 + FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram877( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 877 OF 1240 *** + // Wavefunction(s) for diagram number 877 + // (none) + // Amplitude(s) for diagram number 877 + FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram878( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 878 OF 1240 *** + // Wavefunction(s) for diagram number 878 + // (none) + // Amplitude(s) for diagram number 878 + FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram879( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 879 OF 1240 *** + // Wavefunction(s) for diagram number 879 + // (none) + // Amplitude(s) for diagram number 879 + FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram880( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also 
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 880 OF 1240 *** + // Wavefunction(s) for diagram number 880 + // (none) + // Amplitude(s) for diagram number 880 + VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram881( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 881 OF 1240 *** + // Wavefunction(s) for diagram number 881 + // (none) + // Amplitude(s) for diagram number 881 + FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram882( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 882 OF 1240 *** + // Wavefunction(s) for diagram number 882 + // (none) + // Amplitude(s) for diagram number 882 + VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram883( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 883 OF 1240 *** + // Wavefunction(s) for diagram number 883 + // (none) + // Amplitude(s) for diagram number 883 + FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram884( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 884 OF 1240 *** + // Wavefunction(s) for diagram number 884 + // (none) + // Amplitude(s) for diagram number 884 + FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram885( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 885 OF 1240 *** + // Wavefunction(s) for diagram number 885 + // (none) + // Amplitude(s) for diagram number 885 + FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram886( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 886 OF 1240 *** + // Wavefunction(s) for diagram number 886 + // (none) + // Amplitude(s) for diagram number 886 + FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram887( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 887 OF 1240 *** + // Wavefunction(s) for diagram number 887 + // (none) + // Amplitude(s) for diagram number 887 + VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram888( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 888 OF 1240 *** + // Wavefunction(s) for diagram number 888 + // (none) + // Amplitude(s) for diagram number 888 + FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram889( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 889 OF 1240 *** + // Wavefunction(s) for diagram number 889 + // (none) + // Amplitude(s) for diagram number 889 + FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram890( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const 
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 890 OF 1240 *** + // Wavefunction(s) for diagram number 890 + // (none) + // Amplitude(s) for diagram number 890 + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram891( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 891 OF 1240 *** + // Wavefunction(s) for diagram number 891 + // (none) + // Amplitude(s) for diagram number 891 + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram892( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 892 OF 1240 *** + // Wavefunction(s) for diagram number 892 + // (none) + // Amplitude(s) for diagram number 892 + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 893 OF 1240 *** + // Wavefunction(s) for diagram number 893 + // (none) + // Amplitude(s) for diagram number 893 + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 893 OF 1240 ***
+    // Wavefunction(s) for diagram number 893
+    // (none)
+    // Amplitude(s) for diagram number 893
+    FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 894 OF 1240 ***
+    // Wavefunction(s) for diagram number 894
+    // (none)
+    // Amplitude(s) for diagram number 894
+    FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram895( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 895 OF 1240 ***
+    // Wavefunction(s) for diagram number 895
+    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] );
+    // Amplitude(s) for diagram number 895
+    VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram896( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 896 OF 1240 ***
+    // Wavefunction(s) for diagram number 896
+    // (none)
+    // Amplitude(s) for diagram number 896
+    VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram897( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 897 OF 1240 ***
+    // Wavefunction(s) for diagram number 897
+    // (none)
+    // Amplitude(s) for diagram number 897
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
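+  // Where the "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" comments appear in these kernels, the code
+  // generated with multichannel support is assumed to add, for the enclosing diagram number NNN,
+  // roughly the following single-diagram-enhancement update after each amplitude call (#473),
+  // with channelId derived from channelIds by the boilerplate:
+  //   if( channelIds != nullptr )
+  //   {
+  //     if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] );
+  //     denominators_sv += cxabs2( amp_sv[0] );
+  //   }
+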
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram898( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 898 OF 1240 ***
+    // Wavefunction(s) for diagram number 898
+    // (none)
+    // Amplitude(s) for diagram number 898
+    VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram899( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 899 OF 1240 ***
+    // Wavefunction(s) for diagram number 899
+    // (none)
+    // Amplitude(s) for diagram number 899
+    VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram900( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 900 OF 1240 ***
+    // Wavefunction(s) for diagram number 900
+    // (none)
+    // Amplitude(s) for diagram number 900
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram901( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 901 OF 1240 ***
+    // Wavefunction(s) for diagram number 901
+    // (none)
+    // Amplitude(s) for diagram number 901
+    VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram902( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 902 OF 1240 ***
+    // Wavefunction(s) for diagram number 902
+    // (none)
+    // Amplitude(s) for diagram number 902
+    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram903( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 903 OF 1240 ***
+    // Wavefunction(s) for diagram number 903
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 903
+    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
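+  // Note on the VVVVnP0_1 calls above: VVVV1, VVVV3 and VVVV4 are the three colour structures of
+  // the four-gluon vertex, so three internal off-shell gluon wavefunctions are computed here; they
+  // appear to reuse scratch slots (e.g. w_fp[93], w_fp[90], w_fp[21]) that earlier diagrams no
+  // longer need.
+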
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram904( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 904 OF 1240 ***
+    // Wavefunction(s) for diagram number 904
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 904
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram905( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 905 OF 1240 ***
+    // Wavefunction(s) for diagram number 905
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 905
+    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram906( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 906 OF 1240 ***
+    // Wavefunction(s) for diagram number 906
+    // (none)
+    // Amplitude(s) for diagram number 906
+    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
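+  // J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to return a reference to the colour
+  // amplitude jamp[icol] for the current event (or SIMD event page), roughly:
+  //   static __device__ cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol ); // view into jamps[ncolor*2*nevtORneppV]
+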
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram907( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 907 OF 1240 ***
+    // Wavefunction(s) for diagram number 907
+    // (none)
+    // Amplitude(s) for diagram number 907
+    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram908( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 908 OF 1240 ***
+    // Wavefunction(s) for diagram number 908
+    // (none)
+    // Amplitude(s) for diagram number 908
+    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram909( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 909 OF 1240 ***
+    // Wavefunction(s) for diagram number 909
+    // (none)
+    // Amplitude(s) for diagram number 909
+    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram910( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 910 OF 1240 ***
+    // Wavefunction(s) for diagram number 910
+    // (none)
+    // Amplitude(s) for diagram number 910
+    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram911( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 911 OF 1240 ***
+    // Wavefunction(s) for diagram number 911
+    // (none)
+    // Amplitude(s) for diagram number 911
+    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
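+  // Note: cxtype( 0, 1 ) is the imaginary unit, i.e. the colour flows updated above and below
+  // receive the amplitude with a factor +i or -i rather than +1 or -1.
+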
911 + VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram912( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 912 OF 1240 *** + // Wavefunction(s) for diagram number 912 + // (none) + // Amplitude(s) for diagram number 912 + FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram913( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // 
*** DIAGRAM 913 OF 1240 *** + // Wavefunction(s) for diagram number 913 + // (none) + // Amplitude(s) for diagram number 913 + FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram914( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 914 OF 1240 *** + // Wavefunction(s) for diagram number 914 + // (none) + // Amplitude(s) for diagram number 914 + FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram915( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 915 OF 1240 *** + // Wavefunction(s) for diagram number 915 + // (none) + // Amplitude(s) for diagram number 915 + FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram916( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 916 OF 1240 ***
+    // Wavefunction(s) for diagram number 916
+    // (none)
+    // Amplitude(s) for diagram number 916
+    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram917( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 917 OF 1240 ***
+    // Wavefunction(s) for diagram number 917
+    // (none)
+    // Amplitude(s) for diagram number 917
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram918( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 918 OF 1240 ***
+    // Wavefunction(s) for diagram number 918
+    // (none)
+    // Amplitude(s) for diagram number 918
+    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram919( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 919 OF 1240 ***
+    // Wavefunction(s) for diagram number 919
+    // (none)
+    // Amplitude(s) for diagram number 919
+    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram920( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 920 OF 1240 ***
+    // Wavefunction(s) for diagram number 920
+    // (none)
+    // Amplitude(s) for diagram number 920
+    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
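Each kernel folds the single diagram amplitude amp_sv[0] into a subset of the ncolor color flows in jamps, with weights that are always +1, -1, +i or -i (the cxtype( 0, 1 ) factor). A scalar sketch of the indexing implied by the jamps[ncolor*2*nevtORneppV] layout, assuming real and imaginary parts are stored as separate planes (the actual J_ACCESS::kernelAccessIcol defined elsewhere in this patch may lay the data out differently, e.g. with SIMD event pages):

    // Hypothetical scalar accessor for one event: icol in [0,ncolor), reim in {0,1}
    inline fptype& jampAccess( fptype* jamps, int icol, int reim, int ievt, int nevt )
    {
      return jamps[( icol * 2 + reim ) * nevt + ievt]; // jamps[ncolor*2*nevt]
    }
    // 'jamp += cxtype( 0, 1 ) * amp' then means: re += -amp.imag(); im += amp.real()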
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram921( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 921 OF 1240 ***
+    // Wavefunction(s) for diagram number 921
+    // (none)
+    // Amplitude(s) for diagram number 921
+    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram922( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 922 OF 1240 ***
+    // Wavefunction(s) for diagram number 922
+    // (none)
+    // Amplitude(s) for diagram number 922
+    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram923( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 923 OF 1240 ***
+    // Wavefunction(s) for diagram number 923
+    // (none)
+    // Amplitude(s) for diagram number 923
+    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram924( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 924 OF 1240 ***
+    // Wavefunction(s) for diagram number 924
+    // (none)
+    // Amplitude(s) for diagram number 924
+    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram925( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 925 OF 1240 ***
+    // Wavefunction(s) for diagram number 925
+    // (none)
+    // Amplitude(s) for diagram number 925
+    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram926( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 926 OF 1240 ***
+    // Wavefunction(s) for diagram number 926
+    // (none)
+    // Amplitude(s) for diagram number 926
+    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram927( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 927 OF 1240 ***
+    // Wavefunction(s) for diagram number 927
+    // (none)
+    // Amplitude(s) for diagram number 927
+    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
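The "(#473)" placeholder comments stand for the numerator/denominator update that multichannel-enabled generated code performs after each amplitude call. A sketch of its usual shape in cudacpp-generated code (the exact lines are not shown in this hunk, so treat the names and the use of cxabs2 as assumptions), e.g. for diagram 927:

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == 927 ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the selected channel
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // any diagram contributes when SDE is on
    #endif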
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram928( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 928 OF 1240 ***
+    // Wavefunction(s) for diagram number 928
+    // (none)
+    // Amplitude(s) for diagram number 928
+    FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram929( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 929 OF 1240 ***
+    // Wavefunction(s) for diagram number 929
+    // (none)
+    // Amplitude(s) for diagram number 929
+    FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram930( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 930 OF 1240 ***
+    // Wavefunction(s) for diagram number 930
+    // (none)
+    // Amplitude(s) for diagram number 930
+    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram931( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 931 OF 1240 ***
+    // Wavefunction(s) for diagram number 931
+    // (none)
+    // Amplitude(s) for diagram number 931
+    VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram932( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 932 OF 1240 ***
+    // Wavefunction(s) for diagram number 932
+    // (none)
+    // Amplitude(s) for diagram number 932
+    FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram933( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 933 OF 1240 ***
+    // Wavefunction(s) for diagram number 933
+    // (none)
+    // Amplitude(s) for diagram number 933
+    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram934( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 934 OF 1240 ***
+    // Wavefunction(s) for diagram number 934
+    // (none)
+    // Amplitude(s) for diagram number 934
+    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram935( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 935 OF 1240 ***
+    // Wavefunction(s) for diagram number 935
+    // (none)
+    // Amplitude(s) for diagram number 935
+    FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram936( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 936 OF 1240 ***
+    // Wavefunction(s) for diagram number 936
+    // (none)
+    // Amplitude(s) for diagram number 936
+    VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram937( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 937 OF 1240 ***
+    // Wavefunction(s) for diagram number 937
+    // (none)
+    // Amplitude(s) for diagram number 937
+    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram938( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 938 OF 1240 ***
+    // Wavefunction(s) for diagram number 938
+    // (none)
+    // Amplitude(s) for diagram number 938
+    VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram939( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 939 OF 1240 ***
+    // Wavefunction(s) for diagram number 939
+    // (none)
+    // Amplitude(s) for diagram number 939
+    FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram940( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 940 OF 1240 ***
+    // Wavefunction(s) for diagram number 940
+    // (none)
+    // Amplitude(s) for diagram number 940
+    FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram941( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 941 OF 1240 ***
+    // Wavefunction(s) for diagram number 941
+    // (none)
+    // Amplitude(s) for diagram number 941
+    FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram942( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 942 OF 1240 ***
+    // Wavefunction(s) for diagram number 942
+    // (none)
+    // Amplitude(s) for diagram number 942
+    FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram943( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 943 OF 1240 ***
+    // Wavefunction(s) for diagram number 943
+    // (none)
+    // Amplitude(s) for diagram number 943
+    VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram944( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 944 OF 1240 ***
+    // Wavefunction(s) for diagram number 944
+    // (none)
+    // Amplitude(s) for diagram number 944
+    FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram945( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 945 OF 1240 ***
+    // Wavefunction(s) for diagram number 945
+    // (none)
+    // Amplitude(s) for diagram number 945
+    FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 946 OF 1240 ***
+    // Wavefunction(s) for diagram number 946
+    // (none)
+    // Amplitude(s) for diagram number 946
+    FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram947( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 947 OF 1240 ***
+    // Wavefunction(s) for diagram number 947
+    // (none)
+    // Amplitude(s) for diagram number 947
+    FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram948( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 948 OF 1240 ***
+    // Wavefunction(s) for diagram number 948
+    // (none)
+    // Amplitude(s) for diagram number 948
+    FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
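The "(#473)" comment repeated after every amplitude call marks where a build with MGONGPU_SUPPORTS_MULTICHANNEL defined updates the single-diagram-enhancement (SDE) numerator and denominator. The following is a hedged, self-contained sketch of such an update, using the channelId semantics stated in the kernel signatures (1 to #diagrams, 0 to disable SDE); the function name, cxabs2 analogue and the exact matching rule are assumptions, not the plugin's actual boilerplate.

```cpp
#include <complex>
// Hedged sketch of a multichannel (SDE) update, not the generated code itself.
inline double cxabs2Sketch( const std::complex<double>& c ) { return std::norm( c ); }
void multichannelUpdateSketch( unsigned int channelId,           // event's selected channel (0 = SDE off)
                               unsigned int idiagram,            // this kernel's diagram number
                               const std::complex<double>& amp,  // analogue of amp_sv[0]
                               double& numerator,                // analogue of numerators_sv
                               double& denominator )             // analogue of denominators_sv
{
  if( channelId == 0 ) return;                                   // SDE disabled for this event
  if( channelId == idiagram ) numerator += cxabs2Sketch( amp );  // only the matching diagram feeds the numerator
  denominator += cxabs2Sketch( amp );                            // every diagram feeds the denominator
}
```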
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram949( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 949 OF 1240 ***
+    // Wavefunction(s) for diagram number 949
+    // (none)
+    // Amplitude(s) for diagram number 949
+    FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram950( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 950 OF 1240 ***
+    // Wavefunction(s) for diagram number 950
+    // (none)
+    // Amplitude(s) for diagram number 950
+    FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram951( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 951 OF 1240 ***
+    // Wavefunction(s) for diagram number 951
+    VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    // Amplitude(s) for diagram number 951
+    VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram952( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 952 OF 1240 ***
+    // Wavefunction(s) for diagram number 952
+    // (none)
+    // Amplitude(s) for diagram number 952
+    VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
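Each amplitude call above feeds a fixed set of colour flows: amp_sv[0] is accumulated into selected jamp entries with a relative sign, sometimes with an extra factor of i (the cxtype( 0, 1 ) factor seen in diagrams 943-948), depending on the Lorentz and colour structure. The following is a self-contained, illustrative-only C++ analogue of that accumulation pattern; the index and sign values are examples, not taken from any specific diagram.

```cpp
#include <complex>
#include <vector>
// Illustrative-only sketch of the jamp accumulation pattern in the kernels above.
int main()
{
  using cxtype = std::complex<double>;
  std::vector<cxtype> jamps( 120, cxtype( 0, 0 ) ); // per-event colour flows (size = ncolor)
  const cxtype amp( 0.3, -0.7 );                    // stand-in for one computed amp_sv[0]
  const int icol[] = { 4, 18, 42, 49 };             // colour flows fed by this diagram (example values)
  const double sign[] = { -1, +1, +1, +1 };         // relative signs from the colour algebra
  for( int k = 0; k < 4; k++ )
    jamps[icol[k]] += sign[k] * amp;                // some updates add +/- amp directly...
  jamps[2] -= cxtype( 0, 1 ) * amp;                 // ...others carry an extra factor of i
  return 0;
}
```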
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram953( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 953 OF 1240 ***
+    // Wavefunction(s) for diagram number 953
+    // (none)
+    // Amplitude(s) for diagram number 953
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram954( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 954 OF 1240 ***
+    // Wavefunction(s) for diagram number 954
+    // (none)
+    // Amplitude(s) for diagram number 954
+    VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
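Diagrams 953, 956 and 963 each evaluate a quartic gluon vertex as three separate calls (VVVV1_0, VVVV3_0, VVVV4_0) that share the same four wavefunctions but feed different jamp sign patterns. This mirrors the textbook decomposition of the four-gluon vertex into three colour structures, quoted below for orientation only; the plugin's normalisation and sign conventions may differ.

```latex
% Standard decomposition of the quartic gluon vertex into three colour structures:
\begin{equation}
V^{abcd}_{\mu\nu\rho\sigma} = -i g_s^2 \left[
    f^{abe} f^{cde} \left( g_{\mu\rho} g_{\nu\sigma} - g_{\mu\sigma} g_{\nu\rho} \right)
  + f^{ace} f^{bde} \left( g_{\mu\nu} g_{\rho\sigma} - g_{\mu\sigma} g_{\nu\rho} \right)
  + f^{ade} f^{bce} \left( g_{\mu\nu} g_{\rho\sigma} - g_{\mu\rho} g_{\nu\sigma} \right)
\right]
\end{equation}
```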
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram955( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 955 OF 1240 ***
+    // Wavefunction(s) for diagram number 955
+    // (none)
+    // Amplitude(s) for diagram number 955
+    VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram956( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 956 OF 1240 ***
+    // Wavefunction(s) for diagram number 956
+    // (none)
+    // Amplitude(s) for diagram number 956
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram957( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 957 OF 1240 ***
+    // Wavefunction(s) for diagram number 957
+    // (none)
+    // Amplitude(s) for diagram number 957
+    VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram958( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 958 OF 1240 ***
+    // Wavefunction(s) for diagram number 958
+    // (none)
+    // Amplitude(s) for diagram number 958
+    VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram959( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 959 OF 1240 ***
+    // Wavefunction(s) for diagram number 959
+    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
+    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
+    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 959
+    VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram960( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 960 OF 1240 ***
+    // Wavefunction(s) for diagram number 960
+    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
+    // Amplitude(s) for diagram number 960
+    VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
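Diagrams 959 and 960 materialise one internal current per quartic colour structure, writing them into recycled w_fp slots (94/65/21 and 90/93/69 respectively) rather than freshly allocated storage; the same pool is reused across kernels with different content over time (w_fp[71], for example, holds a fermion current in diagrams 948 and 950, is overwritten with a VVV current by diagram 951, and is then consumed by 952, 953, 964, 967 and 968). A tiny standalone sketch of this slot bookkeeping, with hypothetical names, purely for illustration:

```cpp
#include <cstdio>
// Illustrative-only sketch (hypothetical names): mapping each quartic
// colour-structure call to the w_fp slot it fills in the kernels above.
int main()
{
  const struct { const char* call; int slot959; int slot960; } vvvv[] = {
    { "VVVV1P0_1", 94, 90 }, // first quartic colour structure
    { "VVVV3P0_1", 65, 93 }, // second quartic colour structure
    { "VVVV4P0_1", 21, 69 }, // third quartic colour structure
  };
  for( const auto& v : vvvv )
    std::printf( "%s -> w_fp[%d] (diagram 959), w_fp[%d] (diagram 960)\n",
                 v.call, v.slot959, v.slot960 );
  return 0;
}
```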
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram961( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 961 OF 1240 ***
+    // Wavefunction(s) for diagram number 961
+    // (none)
+    // Amplitude(s) for diagram number 961
+    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram962( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 962 OF 1240 ***
+    // Wavefunction(s) for diagram number 962
+    // (none)
+    // Amplitude(s) for diagram number 962
+    VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram963( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 963 OF 1240 ***
+    // Wavefunction(s) for diagram number 963
+    // (none)
+    // Amplitude(s) for diagram number 963
+    VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram964( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 964 OF 1240 ***
+    // Wavefunction(s) for diagram number 964
+    // (none)
+    // Amplitude(s) for diagram number 964
+    VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram965( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 965 OF 1240 ***
+    // Wavefunction(s) for diagram number 965
+    // (none)
+    // Amplitude(s) for diagram number 965
+    VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram966( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 966 OF 1240 ***
+    // Wavefunction(s) for diagram number 966
+    // (none)
+    // Amplitude(s) for diagram number 966
+    VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram967( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 967 OF 1240 ***
+    // Wavefunction(s) for diagram number 967
+    // (none)
+    // Amplitude(s) for diagram number 967
+    VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram968( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 968 OF 1240 ***
+    // Wavefunction(s) for diagram number 968
+    // (none)
+    // Amplitude(s) for diagram number 968
+    FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram969( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 969 OF 1240 ***
+    // Wavefunction(s) for diagram number 969
+    // (none)
+    // Amplitude(s) for diagram number 969
+    FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram970( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 970 OF 1240 *** + // Wavefunction(s) for diagram number 970 + // (none) + // Amplitude(s) for diagram number 970 + FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram971( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 971 OF 1240 *** + // Wavefunction(s) for diagram number 971 + // (none) + // Amplitude(s) for diagram number 971 + FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
+  diagram972( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 972 OF 1240 ***
+    // Wavefunction(s) for diagram number 972
+    // (none)
+    // Amplitude(s) for diagram number 972
+    VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram973( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 973 OF 1240 ***
+    // Wavefunction(s) for diagram number 973
+    // (none)
+    // Amplitude(s) for diagram number 973
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram974( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 974 OF 1240 ***
+    // Wavefunction(s) for diagram number 974
+    // (none)
+    // Amplitude(s) for diagram number 974
+    FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram975( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 975 OF 1240 ***
+    // Wavefunction(s) for diagram number 975
+    // (none)
+    // Amplitude(s) for diagram number 975
+    FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram976( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 976 OF 1240 ***
+    // Wavefunction(s) for diagram number 976
+    // (none)
+    // Amplitude(s) for diagram number 976
+    FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram977( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 977 OF 1240 ***
+    // Wavefunction(s) for diagram number 977
+    // (none)
+    // Amplitude(s) for diagram number 977
+    VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram978( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 978 OF 1240 ***
+    // Wavefunction(s) for diagram number 978
+    // (none)
+    // Amplitude(s) for diagram number 978
+    FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram979( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 979 OF 1240 ***
+    // Wavefunction(s) for diagram number 979
+    // (none)
+    // Amplitude(s) for diagram number 979
+    FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram980( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 980 OF 1240 ***
+    // Wavefunction(s) for diagram number 980
+    // (none)
+    // Amplitude(s) for diagram number 980
+    FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram981( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 981 OF 1240 ***
+    // Wavefunction(s) for diagram number 981
+    // (none)
+    // Amplitude(s) for diagram number 981
+    FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram982( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 982 OF 1240 ***
+    // Wavefunction(s) for diagram number 982
+    // (none)
+    // Amplitude(s) for diagram number 982
+    VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram983( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 983 OF 1240 ***
+    // Wavefunction(s) for diagram number 983
+    // (none)
+    // Amplitude(s) for diagram number 983
+    FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram984( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 984 OF 1240 ***
+    // Wavefunction(s) for diagram number 984
+    // (none)
+    // Amplitude(s) for diagram number 984
+    FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram985( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 985 OF 1240 ***
+    // Wavefunction(s) for diagram number 985
+    // (none)
+    // Amplitude(s) for diagram number 985
+    FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram986( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 986 OF 1240 ***
+    // Wavefunction(s) for diagram number 986
+    // (none)
+    // Amplitude(s) for diagram number 986
+    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram987( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 987 OF 1240 ***
+    // Wavefunction(s) for diagram number 987
+    // (none)
+    // Amplitude(s) for diagram number 987
+    VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram988( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 988 OF 1240 ***
+    // Wavefunction(s) for diagram number 988
+    // (none)
+    // Amplitude(s) for diagram number 988
+    FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram989( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 989 OF 1240 ***
+    // Wavefunction(s) for diagram number 989
+    // (none)
+    // Amplitude(s) for diagram number 989
+    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram990( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 990 OF 1240 ***
+    // Wavefunction(s) for diagram number 990
+    // (none)
+    // Amplitude(s) for diagram number 990
+    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram991( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 991 OF 1240 ***
+    // Wavefunction(s) for diagram number 991
+    // (none)
+    // Amplitude(s) for diagram number 991
+    FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram992( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 992 OF 1240 ***
+    // Wavefunction(s) for diagram number 992
+    // (none)
+    // Amplitude(s) for diagram number 992
+    VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram993( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 993 OF 1240 ***
+    // Wavefunction(s) for diagram number 993
+    // (none)
+    // Amplitude(s) for diagram number 993
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram994( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 994 OF 1240 *** + // Wavefunction(s) for diagram number 994 + // (none) + // Amplitude(s) for diagram number 994 + VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram995( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 995 OF 1240 *** + // Wavefunction(s) for diagram number 995 + // (none) + // Amplitude(s) for diagram number 995 + FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram996( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 996 OF 1240 *** + // Wavefunction(s) for diagram number 996 + // (none) + // Amplitude(s) for diagram number 996 + FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
+ + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 997 OF 1240 *** + // Wavefunction(s) for diagram number 997 + // (none) + // Amplitude(s) for diagram number 997 + FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram998( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 998 OF 1240 *** + // Wavefunction(s) for diagram number 998 + // (none) + // Amplitude(s) for diagram number 998 + FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram999( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 999 OF 1240 *** + // Wavefunction(s) for diagram number 999 + // (none) + // Amplitude(s) for diagram number 999 + VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1000( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1000 OF 1240 *** + // Wavefunction(s) for diagram number 1000 + // (none) + // Amplitude(s) for diagram number 1000 + FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) *
amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1001( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1001 OF 1240 *** + // Wavefunction(s) for diagram number 1001 + // (none) + // Amplitude(s) for diagram number 1001 + FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1002( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators )
// input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1002 OF 1240 *** + // Wavefunction(s) for diagram number 1002 + // (none) + // Amplitude(s) for diagram number 1002 + FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1003( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1003 OF 1240 *** + // Wavefunction(s) for diagram number 1003 + // (none) + // Amplitude(s) for diagram number 1003 + FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1004( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1004 OF 1240 *** + // Wavefunction(s) for diagram number 1004 + // (none) + // Amplitude(s) for diagram number 1004 + FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1005( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1005 OF 1240 *** + // Wavefunction(s) for diagram number 1005 + // (none) + // Amplitude(s) for diagram number 1005 + FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //--------------------------------------------------------------------------
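Each "#473" comment marks the spot where, in a build with MGONGPU_SUPPORTS_MULTICHANNEL, the generated code also accumulates the single-diagram-enhancement (SDE) weights; the actual generated statements are elided as comments in this hunk. Given the convention stated in the signatures (channelId runs from 1 to #diagrams, 0 disables SDE), a plausible expansion for diagram1005 would be:

    // Sketch only (assumption): the elided multichannel block, e.g. inside diagram1005.
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the sampled channel
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // SDE enabled: every diagram contributes
    #endif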
+ + __global__ void + diagram1006( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1006 OF 1240 *** + // Wavefunction(s) for diagram number 1006 + // (none) + // Amplitude(s) for diagram number 1006 + FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1007( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1007 OF 1240 *** + // Wavefunction(s) for diagram number 1007 + // (none) + // Amplitude(s) for diagram number 1007 + VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1008( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1008 OF 1240 *** + // Wavefunction(s) for diagram number 1008 + // (none) + // Amplitude(s) for diagram number 1008 + VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1009( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds,
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1009 OF 1240 *** + // Wavefunction(s) for diagram number 1009 + // (none) + // Amplitude(s) for diagram number 1009 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1010( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1010 OF 1240 *** + // Wavefunction(s) for diagram number 1010 + // (none) + // Amplitude(s) for diagram number 1010 + VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1011( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1011 OF 1240 *** + // Wavefunction(s) for diagram number 1011 + // (none) + // Amplitude(s) for diagram number 1011 + VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1012( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1012 OF 1240 *** + // Wavefunction(s) for diagram number 1012 + // (none) + // Amplitude(s) for diagram number 1012 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1013( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1013 OF 1240 *** + // Wavefunction(s) for diagram number 1013 + // (none) + //
Amplitude(s) for diagram number 1013 + VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1014( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1014 OF 1240 *** + // Wavefunction(s) for diagram number 1014 + // (none) + // Amplitude(s) for diagram number 1014 + VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + +
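Every jamp update above scatters one amplitude into several color flows with fixed signs, sometimes rotated by cxtype( 0, 1 ), i.e. multiplied by the imaginary unit. The J_ACCESS::kernelAccessIcol accessor itself is defined elsewhere in the plugin; a minimal sketch consistent with the jamps[ncolor*2*nevtORneppV] layout stated in the signatures, under an assumed [icol][re/im][event] ordering, is:

    // Sketch only (assumption): the real J_ACCESS is defined elsewhere in the plugin.
    struct J_ACCESS
    {
      // Return a writable complex (SIMD) view of color flow icol in the current event page.
      static __host__ __device__ inline cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
      {
        return *reinterpret_cast<cxtype_sv*>( jamps + icol * 2 * neppV ); // 2 fptypes (re,im) per event
      }
    };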
//-------------------------------------------------------------------------- + + __global__ void + diagram1015( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1015 OF 1240 *** + // Wavefunction(s) for diagram number 1015 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); + // Amplitude(s) for diagram number 1015 + VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1016( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1016 OF 1240 *** + // Wavefunction(s) for diagram number 1016 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1016 + VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1017( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + //
*** DIAGRAM 1017 OF 1240 *** + // Wavefunction(s) for diagram number 1017 + // (none) + // Amplitude(s) for diagram number 1017 + VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + +
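A side effect of giving all 1240 diagramXXX kernels this identical signature is that the caller can drive them generically, with no per-diagram glue. The actual driver is outside this hunk; in the C++ build (where __global__ expands to nothing) the idea could be sketched, under those assumptions, as:

    // Sketch only (assumption): a generic driver over the uniform diagram kernels, C++ flavour.
    typedef void ( *diagram_t )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
    static const diagram_t diagrams[] = { diagram995, diagram996, diagram997 /* ..., diagram1240 */ };
    for( diagram_t d : diagrams )
      d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // identical call for every diagram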
//-------------------------------------------------------------------------- + + __global__ void + diagram1018( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1018 OF 1240 *** + // Wavefunction(s) for diagram number 1018 + // (none) + // Amplitude(s) for diagram number 1018 + VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1019( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1019 OF 1240 *** + // Wavefunction(s) for diagram number 1019 + // (none) + // Amplitude(s) for diagram number 1019 + VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
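Every diagramXXX kernel above and below starts by including "diagram_boilerplate.h", whose contents are not part of this hunk. A minimal sketch of what such a header could provide, with hypothetical code (only the names amp_sv, amp_fp, w_fp, channelIds, numerators and denominators are taken from the generated code itself):

    // Hypothetical sketch of a diagram_boilerplate.h fragment (included
    // inside each diagramXXX body, so function parameters are in scope):
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // The uniform interface still passes the three multichannel pointers,
    // so the boilerplate can sanity-check that they are all nullptr:
    assert( channelIds == nullptr );
    assert( numerators == nullptr );
    assert( denominators == nullptr );
    #endif
    cxtype_sv amp_sv[1];                                  // buffer for one (possibly SIMD-vector) complex amplitude
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // real-valued alias passed as &amp_fp[0] to the HELAS calls
    // ... plus the setup of the w_fp wavefunction views into the wfs buffer ...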
+  __global__ void
+  diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1020 OF 1240 ***
+    // Wavefunction(s) for diagram number 1020
+    // (none)
+    // Amplitude(s) for diagram number 1020
+    VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1021( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1021 OF 1240 ***
+    // Wavefunction(s) for diagram number 1021
+    // (none)
+    // Amplitude(s) for diagram number 1021
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
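Kernels like diagram1021 above call VVVV1_0, VVVV3_0 and VVVV4_0 on the same four wavefunctions: the four-gluon vertex contributes three independent Lorentz/color structures, and each call yields one amplitude that is then scattered into the color-flow buffer with coefficients of +1 or -1. A minimal sketch of that scatter pattern, using a hypothetical scalar jamp[] buffer instead of the generated J_ACCESS accessor:

    #include <complex>
    using cxtype = std::complex<double>;
    // Scatter one amplitude into n color flows with signs +1/-1, mirroring
    // the += / -= lines emitted by the code generator for each HELAS call:
    inline void scatterAmp( cxtype* jamp, const cxtype& amp, const int* icols, const int* signs, int n )
    {
      for( int i = 0; i < n; i++ )
        jamp[icols[i]] += ( signs[i] > 0 ? amp : -amp );
    }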
+  __global__ void
+  diagram1022( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1022 OF 1240 ***
+    // Wavefunction(s) for diagram number 1022
+    // (none)
+    // Amplitude(s) for diagram number 1022
+    VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1023( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1023 OF 1240 ***
+    // Wavefunction(s) for diagram number 1023
+    // (none)
+    // Amplitude(s) for diagram number 1023
+    VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
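The J_ACCESS::kernelAccessIcol calls map a color-flow index onto the entry of the jamps buffer owned by the current event. The actual J_ACCESS class is defined elsewhere in this patch; a hypothetical sketch of such an accessor, assuming jamps is laid out as real/imaginary planes per color flow (cxtype_ref being a reference-proxy type supporting += and -=, and nevt assumed visible in scope):

    // Hypothetical sketch only; the real accessor in this patch may differ:
    __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
      fptype& real = jamps[( 2 * icol ) * nevt + ievt];       // real plane for color flow icol
      fptype& imag = jamps[( 2 * icol + 1 ) * nevt + ievt];   // imaginary plane for color flow icol
      return cxtype_ref( real, imag );                        // proxy so that "+= amp_sv[0]" works
    }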
+  __global__ void
+  diagram1024( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1024 OF 1240 ***
+    // Wavefunction(s) for diagram number 1024
+    // (none)
+    // Amplitude(s) for diagram number 1024
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1025( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1025 OF 1240 ***
+    // Wavefunction(s) for diagram number 1025
+    // (none)
+    // Amplitude(s) for diagram number 1025
+    VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1026( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1026 OF 1240 ***
+    // Wavefunction(s) for diagram number 1026
+    // (none)
+    // Amplitude(s) for diagram number 1026
+    VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
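The "#473" comments mark the spot where the multichannel build updates the single-diagram-enhancement accumulators after each amplitude call. A minimal sketch of such an update, with hypothetical helper names (channelId, idiagram, cxabs2, numerators_sv and denominators_sv are illustrative; only the numerators/denominators kernel arguments are from the generated code):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId != 0 ) // SDE not disabled for this event
    {
      const fptype_sv amp2 = cxabs2( amp_sv[0] );        // |amp|^2 for this diagram and helicity
      if( channelId == idiagram ) numerators_sv += amp2; // only the selected channel feeds the numerator
      denominators_sv += amp2;                           // every contributing diagram feeds the denominator
    }
    #endif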
+  __global__ void
+  diagram1027( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1027 OF 1240 ***
+    // Wavefunction(s) for diagram number 1027
+    // (none)
+    // Amplitude(s) for diagram number 1027
+    VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
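Most kernels in this hunk reuse wavefunctions computed earlier ("// (none)"), but diagram1028 just below first builds three internal off-shell gluon wavefunctions with the VVVVxP0_1 routines before contracting them with VVV1_0. The shape of those helpers, paraphrased from the call sites below (parameter names are guesses; the authoritative prototypes live in the generated HelAmps code):

    // Inferred, not authoritative: compute the off-shell vector wavefunction
    // V1 (mass M1, width W1) from three on-shell vector wavefunctions.
    __device__ void VVVV1P0_1( const fptype* V2,   // vector wavefunction 2
                               const fptype* V3,   // vector wavefunction 3
                               const fptype* V4,   // vector wavefunction 4
                               const fptype* COUP, // coupling (COUPs[2], the VVVV coupling, below)
                               const fptype factor, // overall factor (1.0 below)
                               const fptype M1,    // off-shell leg mass (0. for a gluon)
                               const fptype W1,    // off-shell leg width (0. below)
                               fptype* V1 );       // output wavefunction (w_fp[10]/[16]/[111] below)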
+  __global__ void
+  diagram1028( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1028 OF 1240 ***
+    // Wavefunction(s) for diagram number 1028
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1028
+    VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1029( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1029 OF 1240 ***
+    // Wavefunction(s) for diagram number 1029
+    // (none)
+    // Amplitude(s) for diagram number 1029
+    VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
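Note the asymmetry in the coupling arguments: on GPU each kernel receives the flat couplings buffer for all events, while in C++ it receives per-event-page pointers COUPs directly. A hypothetical sketch of how the GPU branch of the boilerplate could derive COUPs from that buffer (ndcoup, nxcoup and nevt are taken from the parameter comments; the loop and layout are assumptions):

    #ifdef MGONGPUCPP_GPUIMPL
    const fptype* COUPs[nxcoup];
    for( int idcoup = 0; idcoup < ndcoup; idcoup++ )
      COUPs[idcoup] = &couplings[idcoup * nevt * 2]; // dependent couplings: one (re,im) pair per event
    // ... the independent couplings would then be appended from constant parameters ...
    #endif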
+  __global__ void
+  diagram1030( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1030 OF 1240 ***
+    // Wavefunction(s) for diagram number 1030
+    // (none)
+    // Amplitude(s) for diagram number 1030
+    VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1031( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1031 OF 1240 ***
+    // Wavefunction(s) for diagram number 1031
+    // (none)
+    // Amplitude(s) for diagram number 1031
+    VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
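Every diagram in this process gets its own __global__ kernel with an identical signature, which makes them trivially table-driven on the host side. A minimal host-side sketch, assuming a hypothetical launchDiagram helper (one plausible motivation for one kernel per diagram is to keep per-kernel register pressure and compilation time bounded for a 1240-diagram process):

    // Hypothetical driver sketch, not the actual scheduling code of this patch:
    using DiagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
    static const DiagramKernel diagramKernels[] = { diagram1018, diagram1019, diagram1020 /* ..., all 1240 */ };
    for( DiagramKernel kernel : diagramKernels )
      launchDiagram( kernel, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );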
+  __global__ void
+  diagram1032( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1032 OF 1240 ***
+    // Wavefunction(s) for diagram number 1032
+    // (none)
+    // Amplitude(s) for diagram number 1032
+    VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1033( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1033 OF 1240 ***
+    // Wavefunction(s) for diagram number 1033
+    // (none)
+    // Amplitude(s) for diagram number 1033
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1034( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1034 OF 1240 ***
+    // Wavefunction(s) for diagram number 1034
+    // (none)
+    // Amplitude(s) for diagram number 1034
+    VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1035( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1035 OF 1240 ***
+    // Wavefunction(s) for diagram number 1035
+    // (none)
+    // Amplitude(s) for diagram number 1035
+    VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
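The HELAS calls above all take &amp_fp[0] as their output argument while the color sums read amp_sv[0]: the two names are views of the same storage. A sketch of that aliasing with hypothetical minimal types (the plugin's own fptype and cxtype_sv are configurable and SIMD-aware):

    #include <complex>
    using fptype = double;                                // illustrative; the plugin also supports float
    using cxtype = std::complex<fptype>;
    cxtype amp_sv[1];                                     // the amplitude a HELAS call fills
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // same storage viewed as two reals
    // A call like VVV1_0( ..., &amp_fp[0] ) writes (re, im) into amp_fp[0..1],
    // i.e. into amp_sv[0], which the J_ACCESS lines then add into the color flows.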
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1036 OF 1240 *** + // Wavefunction(s) for diagram number 1036 + // (none) + // Amplitude(s) for diagram number 1036 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1037( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1037 OF 1240 *** + // Wavefunction(s) for diagram number 1037 + // (none) + // Amplitude(s) for diagram number 1037 + VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
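Note on the two comments repeated in every kernel above and below: the uniform diagramNNNN signature keeps channelIds, numerators and denominators even in builds without MGONGPU_SUPPORTS_MULTICHANNEL, and diagram_boilerplate.h (not part of this hunk) is described as asserting that the three unused pointers are nullptr in that case. A minimal illustrative sketch of that sanity check, under the assumption that this is all the header does with these three arguments (names here are hypothetical):

    #include <cassert>
    #include <cstddef>

    using fptype = double; // assumption: stand-in for the plugin's fptype alias

    // Hypothetical stand-alone version of the nullptr sanity check that the
    // comments above attribute to diagram_boilerplate.h when multichannel
    // support is compiled out.
    inline void assertNoMultichannelInputs( const unsigned int* channelIds,
                                            const fptype* numerators,
                                            const fptype* denominators )
    {
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    }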
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1038 OF 1240 ***
+    // Wavefunction(s) for diagram number 1038
+    // (none)
+    // Amplitude(s) for diagram number 1038
+    VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1039( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1039 OF 1240 ***
+    // Wavefunction(s) for diagram number 1039
+    // (none)
+    // Amplitude(s) for diagram number 1039
+    VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1040( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1040 OF 1240 ***
+    // Wavefunction(s) for diagram number 1040
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] );
+    // Amplitude(s) for diagram number 1040
+    VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1041( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1041 OF 1240 ***
+    // Wavefunction(s) for diagram number 1041
+    // (none)
+    // Amplitude(s) for diagram number 1041
+    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1042( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1042 OF 1240 ***
+    // Wavefunction(s) for diagram number 1042
+    // (none)
+    // Amplitude(s) for diagram number 1042
+    VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
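Each amplitude call above computes one scalar (or SIMD vector) amplitude amp_sv[0], which is then added to or subtracted from a subset of the colour-flow sums; J_ACCESS::kernelAccessIcol( jamps, icol ) acts as an lvalue accessor to the icol-th jamp of the current event (GPU thread, or SIMD event page in C++). A scalar sketch of this accumulation pattern, with a hypothetical one-event jamps layout (the real J_ACCESS handles the interleaved [ncolor*2*nevtORneppV] buffer):

    #include <complex>
    #include <vector>

    using fptype = double;
    using cxtype = std::complex<fptype>; // assumption: cxtype is a complex pair of fptype

    // Hypothetical scalar analogue of J_ACCESS::kernelAccessIcol for one event:
    // return a mutable reference to the icol-th colour-flow amplitude.
    inline cxtype& kernelAccessIcol( std::vector<cxtype>& jamps, int icol )
    {
      return jamps[icol];
    }

    int main()
    {
      std::vector<cxtype> jamps( 120 ); // colour-flow indices up to 119 appear in this hunk
      const cxtype amp{ 0.1, 0.2 };     // stand-in for amp_sv[0]
      kernelAccessIcol( jamps, 29 ) += amp; // same pattern as the generated lines above
      kernelAccessIcol( jamps, 33 ) -= amp;
      return 0;
    }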
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1043( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1043 OF 1240 ***
+    // Wavefunction(s) for diagram number 1043
+    // (none)
+    // Amplitude(s) for diagram number 1043
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
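Every amplitude above is followed by the same placeholder: in a build with MGONGPU_SUPPORTS_MULTICHANNEL, the generated code additionally updates numerators_sv and denominators_sv at that point (#473). That code is elided from this hunk; the following is only a sketch of the usual single-diagram-enhancement bookkeeping, under the assumption that the numerator collects |amp|^2 for the event's selected channel only, while the denominator sums |amp|^2 over all diagrams:

    #include <complex>

    using fptype = double;

    // Hypothetical per-event update behind the "(#473)" comments: channelId is
    // 1-based (0 disables SDE) and thisDiagram is the current diagram number.
    inline void updateMultichannel( unsigned int channelId,
                                    unsigned int thisDiagram,
                                    const std::complex<fptype>& amp,
                                    fptype& numerator,
                                    fptype& denominator )
    {
      if( channelId == 0 ) return;          // SDE disabled for this event
      const fptype amp2 = std::norm( amp ); // |amp|^2
      if( channelId == thisDiagram ) numerator += amp2;
      denominator += amp2;
    }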
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1044( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1044 OF 1240 ***
+    // Wavefunction(s) for diagram number 1044
+    // (none)
+    // Amplitude(s) for diagram number 1044
+    VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1045( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1045 OF 1240 ***
+    // Wavefunction(s) for diagram number 1045
+    // (none)
+    // Amplitude(s) for diagram number 1045
+    VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1046( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1046 OF 1240 ***
+    // Wavefunction(s) for diagram number 1046
+    // (none)
+    // Amplitude(s) for diagram number 1046
+    FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+  }
+
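The point of the uniform signature (and of the nullptr sanity check) is that every diagramNNNN kernel can be driven through a single function type. The driver itself is not part of this hunk, so the following loop is only a sketch, written against the C++ (non-GPU) branch of the #ifdef and with hypothetical names throughout:

    using fptype = double; // assumption, as in the sketches above

    // Hypothetical function type matching the uniform kernel signature of the
    // C++ branch (COUPs rather than the GPU couplings array).
    typedef void ( *DiagramKernel )( fptype* wfs,
                                     fptype* jamps,
                                     const unsigned int* channelIds,
                                     const fptype** COUPs,
                                     fptype* numerators,
                                     fptype* denominators );

    // Sketch of a driver loop: run every diagram kernel on the same buffers,
    // so that jamps accumulates the contributions of all diagrams.
    inline void computeAllDiagrams( const DiagramKernel* kernels, int ndiag,
                                    fptype* wfs, fptype* jamps,
                                    const unsigned int* channelIds, const fptype** COUPs,
                                    fptype* numerators, fptype* denominators )
    {
      for( int i = 0; i < ndiag; ++i )
        kernels[i]( wfs, jamps, channelIds, COUPs, numerators, denominators );
    }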
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1047( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1047 OF 1240 ***
+    // Wavefunction(s) for diagram number 1047
+    // (none)
+    // Amplitude(s) for diagram number 1047
+    FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1048( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1048 OF 1240 ***
+    // Wavefunction(s) for diagram number 1048
+    // (none)
+    // Amplitude(s) for diagram number 1048
+    FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1049( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1049 OF 1240 ***
+    // Wavefunction(s) for diagram number 1049
+    // (none)
+    // Amplitude(s) for diagram number 1049
+    FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1050( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1050 OF 1240 ***
+    // Wavefunction(s) for diagram number 1050
+    // (none)
+    // Amplitude(s) for diagram number 1050
+    FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1051( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1051 OF 1240 ***
+    // Wavefunction(s) for diagram number 1051
+    // (none)
+    // Amplitude(s) for diagram number 1051
+    FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1052( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1052 OF 1240 ***
+    // Wavefunction(s) for diagram number 1052
+    // (none)
+    // Amplitude(s) for diagram number 1052
+    FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1053( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1053 OF 1240 ***
+    // Wavefunction(s) for diagram number 1053
+    // (none)
+    // Amplitude(s) for diagram number 1053
+    FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1054( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1054 OF 1240 ***
+    // Wavefunction(s) for diagram number 1054
+    // (none)
+    // Amplitude(s) for diagram number 1054
+    FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1055( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1055 OF 1240 ***
+    // Wavefunction(s) for diagram number 1055
+    // (none)
+    // Amplitude(s) for diagram number 1055
+    FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1056( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1056 OF 1240 ***
+    // Wavefunction(s) for diagram number 1056
+    // (none)
+    // Amplitude(s) for diagram number 1056
+    FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1057( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1057 OF 1240 ***
+    // Wavefunction(s) for diagram number 1057
+    // (none)
+    // Amplitude(s) for diagram number 1057
+    FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1058( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1058 OF 1240 ***
+    // Wavefunction(s) for diagram number 1058
+    // (none)
+    // Amplitude(s) for diagram number 1058
+    FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1059( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1059 OF 1240 ***
+    // Wavefunction(s) for diagram number 1059
+    // (none)
+    // Amplitude(s) for diagram number 1059
+    FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1060( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1060 OF 1240 ***
+    // Wavefunction(s) for diagram number 1060
+    // (none)
+    // Amplitude(s) for diagram number 1060
+    FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1061(
fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1061 OF 1240 *** + // Wavefunction(s) for diagram number 1061 + // (none) + // Amplitude(s) for diagram number 1061 + VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1062( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1062 OF 1240 *** + // Wavefunction(s) for diagram number 1062 + // (none) + // Amplitude(s) for diagram number 1062 + FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
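Each of the per-diagram kernels above textually includes the same diagram_boilerplate.h fragment, whose contents are not part of this hunk. As a reading aid, here is a minimal sketch of what such a fragment could contain, assuming hypothetical constants and accessors (nwf, nw6, neppV, nxcoup, ndcoup, CD_ACCESS) beyond the argument names visible in the kernels; only the nullptr sanity check is directly attested by the generated comments.

  // diagram_boilerplate.h (sketch only, not the actual file in this diff)
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // The uniform kernel interface is kept even without multichannel support,
  // but the three SDE-related pointers must then be null: check this explicitly
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif
  // Local amplitude buffer: amp_sv is the complex (vector) view, amp_fp the flat fptype view
  cxtype_sv amp_sv[1];
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv );
  // Wavefunction slots: w_fp[iwf] points into the wfs argument (hypothetical layout)
  fptype* w_fp[nwf];
  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * 2 * nw6 * neppV;
#ifdef MGONGPUCPP_GPUIMPL
  // On GPU, rebuild the per-event COUPs array from the global couplings buffer
  // (the accessor name is an assumption for this sketch)
  const fptype* COUPs[nxcoup];
  for( int idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
#endif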
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1063( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1063 OF 1240 ***
+    // Wavefunction(s) for diagram number 1063
+    // (none)
+    // Amplitude(s) for diagram number 1063
+    VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1064( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1064 OF 1240 ***
+    // Wavefunction(s) for diagram number 1064
+    // (none)
+    // Amplitude(s) for diagram number 1064
+    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1065( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1065 OF 1240 ***
+    // Wavefunction(s) for diagram number 1065
+    // (none)
+    // Amplitude(s) for diagram number 1065
+    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1066( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1066 OF 1240 ***
+    // Wavefunction(s) for diagram number 1066
+    // (none)
+    // Amplitude(s) for diagram number 1066
+    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1067( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1067 OF 1240 ***
+    // Wavefunction(s) for diagram number 1067
+    // (none)
+    // Amplitude(s) for diagram number 1067
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1068( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1068 OF 1240 ***
+    // Wavefunction(s) for diagram number 1068
+    // (none)
+    // Amplitude(s) for diagram number 1068
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1069( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1069 OF 1240 ***
+    // Wavefunction(s) for diagram number 1069
+    // (none)
+    // Amplitude(s) for diagram number 1069
+    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1070( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1070 OF 1240 ***
+    // Wavefunction(s) for diagram number 1070
+    // (none)
+    // Amplitude(s) for diagram number 1070
+    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1071( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1071 OF 1240 ***
+    // Wavefunction(s) for diagram number 1071
+    // (none)
+    // Amplitude(s) for diagram number 1071
+    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1072( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1072 OF 1240 ***
+    // Wavefunction(s) for diagram number 1072
+    // (none)
+    // Amplitude(s) for diagram number 1072
+    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1073( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1073 OF 1240 ***
+    // Wavefunction(s) for diagram number 1073
+    // (none)
+    // Amplitude(s) for diagram number 1073
+    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1074( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1074 OF 1240 ***
+    // Wavefunction(s) for diagram number 1074
+    // (none)
+    // Amplitude(s) for diagram number 1074
+    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1075( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1075 OF 1240 ***
+    // Wavefunction(s) for diagram number 1075
+    // (none)
+    // Amplitude(s) for diagram number 1075
+    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1076( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1076 OF 1240 ***
+    // Wavefunction(s) for diagram number 1076
+    // (none)
+    // Amplitude(s) for diagram number 1076
+    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1077( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1077 OF 1240 ***
+    // Wavefunction(s) for diagram number 1077
+    // (none)
+    // Amplitude(s) for diagram number 1077
+    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1078( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1078 OF 1240 ***
+    // Wavefunction(s) for diagram number 1078
+    // (none)
+    // Amplitude(s) for diagram number 1078
+    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1079( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1079 OF 1240 ***
+    // Wavefunction(s) for diagram number 1079
+    // (none)
+    // Amplitude(s) for diagram number 1079
+    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
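The "(#473)" comment repeated inside every MGONGPU_SUPPORTS_MULTICHANNEL block stands in for the numerator/denominator update of single-diagram-enhancement (SDE) multichannel sampling. A sketch of the elided update is shown below, modelled on the pattern used elsewhere in the cudacpp code base; NUM_ACCESS, DEN_ACCESS, the scalar channelId read and the channel number (1079 is only a placeholder for the channel this diagram contributes to) are assumptions of this sketch.

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Sketch only: accumulate |amp|^2 into the SDE numerator when this diagram's
  // channel is the selected one, and into the denominator whenever SDE is enabled
  const unsigned int channelId = channelIds[0]; // SCALAR channelId for C++; per-event on GPU
  if( channelId == 1079 ) NUM_ACCESS::kernelAccess( numerators ) += cxabs2( amp_sv[0] );
  if( channelId != 0 ) DEN_ACCESS::kernelAccess( denominators ) += cxabs2( amp_sv[0] );
#endif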
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1080( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1080 OF 1240 ***
+    // Wavefunction(s) for diagram number 1080
+    // (none)
+    // Amplitude(s) for diagram number 1080
+    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1081( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1081 OF 1240 ***
+    // Wavefunction(s) for diagram number 1081
+    // (none)
+    // Amplitude(s) for diagram number 1081
+    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1082( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1082 OF 1240 ***
+    // Wavefunction(s) for diagram number 1082
+    // (none)
+    // Amplitude(s) for diagram number 1082
+    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1083( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1083 OF 1240 ***
+    // Wavefunction(s) for diagram number 1083
+    // (none)
+    // Amplitude(s) for diagram number 1083
+    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1084( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1084 OF 1240 ***
+    // Wavefunction(s) for diagram number 1084
+    // (none)
+    // Amplitude(s) for diagram number 1084
+    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1085( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1085 OF 1240 ***
+    // Wavefunction(s) for diagram number 1085
+    // (none)
+    // Amplitude(s) for diagram number 1085
+    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1086( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1086 OF 1240 ***
+    // Wavefunction(s) for diagram number 1086
+    // (none)
+    // Amplitude(s) for diagram number 1086
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1087( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1087 OF 1240 ***
+    // Wavefunction(s) for diagram number 1087
+    // (none)
+    // Amplitude(s) for diagram number 1087
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1088( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1088 OF 1240 ***
+    // Wavefunction(s) for diagram number 1088
+    // (none)
+    // Amplitude(s) for diagram number 1088
+    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1089( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1089 OF 1240 ***
+    // Wavefunction(s) for diagram number 1089
+    // (none)
+    // Amplitude(s) for diagram number 1089
+    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1090( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1090 OF 1240 ***
+    // Wavefunction(s) for diagram number 1090
+    // (none)
+    // Amplitude(s) for diagram number 1090
+    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1091( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1091 OF 1240 ***
+    // Wavefunction(s) for diagram number 1091
+    // (none)
+    // Amplitude(s) for diagram number 1091
+    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+  }
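With one __global__ kernel per diagram, a caller has to chain all 1240 kernels over the same wfs/jamps buffers. The sketch below shows one possible way two consecutive kernels from this hunk might be invoked; the launch configuration names (gpublocks, gputhreads) are assumptions, and on C++ builds __global__ is taken to expand to nothing so that the same functions become plain calls per event page.

#ifdef MGONGPUCPP_GPUIMPL
  // GPU build (sketch): each diagram is a separate kernel launch over all events
  diagram1090<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram1091<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
#else
  // C++ build (sketch): plain function calls for one event page
  diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators );
  diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif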
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1092( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1092 OF 1240 ***
+    // Wavefunction(s) for diagram number 1092
+    // (none)
+    // Amplitude(s) for diagram number 1092
+    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1093( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1093 OF 1240 ***
+    // Wavefunction(s) for diagram number 1093
+    // (none)
+    // Amplitude(s) for diagram number 1093
+    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1094( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1094 OF 1240 *** + // Wavefunction(s) for diagram number 1094 + // (none) + // Amplitude(s) for diagram number 1094 + FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1095( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1095 OF 1240 *** + // Wavefunction(s) for diagram number 1095 + // (none) + // Amplitude(s) for diagram number 1095 + FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1096( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1096 OF 
1240 *** + // Wavefunction(s) for diagram number 1096 + // (none) + // Amplitude(s) for diagram number 1096 + FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1097( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1097 OF 1240 *** + // Wavefunction(s) for diagram number 1097 + // (none) + // Amplitude(s) for diagram number 1097 + FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1098( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1098 OF 1240 *** + // Wavefunction(s) for diagram number 1098 + // (none) + // Amplitude(s) for diagram number 1098 + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1099( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1099 OF 1240 *** + // Wavefunction(s) for diagram number 1099 + // (none) + // Amplitude(s) for diagram number 1099 + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1100 OF 1240 *** + // Wavefunction(s) for diagram number 1100 + // (none) + // Amplitude(s) for diagram number 1100 + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 
1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1101 OF 1240 *** + // Wavefunction(s) for diagram number 1101 + // (none) + // Amplitude(s) for diagram number 1101 + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1102 OF 1240 *** + // Wavefunction(s) for diagram number 1102 + // (none) + // Amplitude(s) for diagram 
number 1102 + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1103 OF 
1240 *** + // Wavefunction(s) for diagram number 1103 + // (none) + // Amplitude(s) for diagram number 1103 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1104 OF 1240 *** + // Wavefunction(s) for diagram number 1104 + // (none) + // Amplitude(s) for diagram number 1104 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1105 OF 1240 *** + // Wavefunction(s) for diagram number 1105 + // (none) + // Amplitude(s) for diagram number 1105 + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1106 OF 1240 *** + // Wavefunction(s) for diagram number 1106 + // (none) + // Amplitude(s) for diagram number 1106 + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1107 OF 1240 *** + // Wavefunction(s) for diagram number 1107 + // (none) + // Amplitude(s) for diagram number 1107 + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, 
&_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1108 OF 1240 *** + // Wavefunction(s) for diagram number 1108 + // (none) + // Amplitude(s) for diagram number 1108 + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1109 OF 1240 *** + // Wavefunction(s) for diagram number 1109 + // (none) + // Amplitude(s) for diagram number 1109 + 
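The three FFV1_0 calls below are the colour-flow amplitudes of diagram 1109; after each one, the "#473" placeholder comment stands for the numerators_sv/denominators_sv update that a build with MGONGPU_SUPPORTS_MULTICHANNEL generates. A self-contained sketch of that single-diagram-enhancement update follows (the helper name multichannelUpdate and the scalar signature are illustrative; the generated code is not shown in this patch and operates on vector types in place):

```cpp
#include <complex>
typedef double fptype;                // assumption: double-precision build
typedef std::complex<fptype> cxtype;

inline fptype cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

// Sketch of the per-amplitude update hidden behind the "#473" placeholder comments,
// following the usual MG5aMC single-diagram-enhancement pattern (an assumption)
void multichannelUpdate( unsigned int channelId,   // 1 to #diagrams, 0 disables SDE
                         unsigned int thisDiagram, // e.g. 1109 for the kernel below
                         const cxtype& amp,
                         fptype& numerator,
                         fptype& denominator )
{
  if( channelId == thisDiagram ) numerator += cxabs2( amp ); // only the selected channel
  if( channelId != 0 ) denominator += cxabs2( amp );         // every diagram contributes
}
```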
+    FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1110( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1110 OF 1240 ***
+    // Wavefunction(s) for
diagram number 1110 + // (none) + // Amplitude(s) for diagram number 1110 + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1111 OF 1240 *** + // Wavefunction(s) for diagram number 1111 + // (none) + // Amplitude(s) for diagram number 1111 + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1112 OF 1240 *** + // Wavefunction(s) for diagram number 1112 + // (none) + // Amplitude(s) for diagram number 1112 + FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
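The J_ACCESS::kernelAccessIcol calls below (and throughout this file) accumulate each amplitude into the colour-ordered jamps buffer, with cxtype( 0, 1 ) factors supplying the ±i colour-flow phases. A minimal sketch of what such an accessor could look like, assuming a [ncolor][2][nevt] SoA layout and one GPU thread per event (layout, types and signature are all assumptions here, not the plugin's actual J_ACCESS implementation):

```cpp
#include <thrust/complex.h>
typedef double fptype;                  // assumption: double-precision build
typedef thrust::complex<fptype> cxtype; // stand-in for the plugin's cxtype

// Stand-in for a reference to a complex number whose real and imaginary parts
// live in two separate SoA planes of the jamps buffer
struct cxtype_ref
{
  fptype& r;
  fptype& i;
  __device__ cxtype_ref& operator+=( const cxtype& a ) { r += a.real(); i += a.imag(); return *this; }
  __device__ cxtype_ref& operator-=( const cxtype& a ) { r -= a.real(); i -= a.imag(); return *this; }
};

// Possible shape of an SoA accessor behind J_ACCESS::kernelAccessIcol
__device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
{
  const int nevt = blockDim.x * gridDim.x;                // total events in this grid
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // this thread's event
  return cxtype_ref{ jamps[( 2 * icol ) * nevt + ievt],       // real plane for colour icol
                     jamps[( 2 * icol + 1 ) * nevt + ievt] }; // imaginary plane
}
```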
J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1113 OF 1240 *** + // Wavefunction(s) for diagram number 1113 + // (none) + // Amplitude(s) for diagram number 1113 + VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1114 OF 1240 *** + // Wavefunction(s) for diagram number 1114 + // (none) + // Amplitude(s) for diagram number 1114 + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1115 OF 1240 *** + // Wavefunction(s) for diagram number 1115 + // (none) + // Amplitude(s) for diagram number 1115 + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1116 OF 1240 *** + // Wavefunction(s) for diagram number 1116 + // (none) + // Amplitude(s) for diagram number 1116 + FFV1_0( w_fp[38], 
w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1117 OF 1240 *** + // Wavefunction(s) for diagram number 
1117 + // (none) + // Amplitude(s) for diagram number 1117 + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1118 OF 1240 *** + // Wavefunction(s) for diagram number 1118 + // (none) + // Amplitude(s) for diagram number 1118 + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1119 OF 1240 *** + // Wavefunction(s) for diagram number 1119 + // (none) + // Amplitude(s) for diagram number 1119 + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1120 OF 1240 *** + // Wavefunction(s) for diagram number 1120 + // (none) + // Amplitude(s) for diagram number 1120 + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1121 OF 1240 *** + // Wavefunction(s) for diagram number 1121 + // (none) + // Amplitude(s) for diagram number 1121 + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1122 OF 1240 *** + // Wavefunction(s) for diagram number 1122 + // (none) + // Amplitude(s) for diagram number 1122 + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1123 OF 1240 *** + // Wavefunction(s) for diagram number 1123 + // (none) + // Amplitude(s) for diagram number 1123 + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1123 OF 1240 ***
+    // Wavefunction(s) for diagram number 1123
+    // (none)
+    // Amplitude(s) for diagram number 1123
+    FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1124 OF 1240 ***
+    // Wavefunction(s) for diagram number 1124
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
+    // Amplitude(s) for diagram number 1124
+    VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
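Each amplitude call above is followed by an #ifdef MGONGPU_SUPPORTS_MULTICHANNEL placeholder comment pointing at issue #473; the actual numerator/denominator update is not spelled out in this diff. In the single-diagram-enhancement (SDE) scheme such an update conventionally adds |amp|^2 to the denominator for every contributing diagram and to the numerator only when the event's channelId selects the current diagram. A minimal sketch under those assumptions, with SIMD vector types elided (channelIdC, idiagram and the |z|^2 helper cxabs2 are illustrative names, not taken from this diff):

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Sketch only, not the generated update itself.
if( channelIds != nullptr ) // SDE multichannel enabled for this run?
{
  const fptype amp2 = cxabs2( amp_sv[0] ); // |amp|^2 for this diagram (assumed helper)
  denominators_sv += amp2;                 // every diagram feeds the denominator
  if( channelIdC == idiagram )             // does this event's channel pick this diagram?
    numerators_sv += amp2;                 // only the selected diagram feeds the numerator
}
#endif
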
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1125 OF 1240 ***
+    // Wavefunction(s) for diagram number 1125
+    VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    // Amplitude(s) for diagram number 1125
+    VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1126 OF 1240 ***
+    // Wavefunction(s) for diagram number 1126
+    VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
+    VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1126
+    VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1127 OF 1240 ***
+    // Wavefunction(s) for diagram number 1127
+    // (none)
+    // Amplitude(s) for diagram number 1127
+    VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
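All color-flow updates above go through J_ACCESS::kernelAccessIcol( jamps, icol ), with jamps documented as jamps[ncolor*2*nevtORneppV]. Below is a standalone sketch of such an accessor, under the assumption of an SoA layout in which each color index owns a page of neppV real parts followed by neppV imaginary parts; the plugin's real J_ACCESS class is not part of this diff and may differ.

#include <complex>
// Standalone sketch (NOT the plugin's J_ACCESS): assumed layout is
// jamps[icol][ri][ievt], with ievt the event within a SIMD page of neppV.
template<typename FP, int neppV>
struct JampAccessSketch
{
  // Accumulate a complex amplitude into the jamp of color icol for event ievt.
  static void addIcol( FP* jamps, int icol, int ievt, std::complex<FP> amp )
  {
    jamps[( icol * 2 + 0 ) * neppV + ievt] += amp.real(); // real part page
    jamps[( icol * 2 + 1 ) * neppV + ievt] += amp.imag(); // imaginary part page
  }
};
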
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1128 OF 1240 ***
+    // Wavefunction(s) for diagram number 1128
+    FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
+    // Amplitude(s) for diagram number 1128
+    FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1129 OF 1240 ***
+    // Wavefunction(s) for diagram number 1129
+    // (none)
+    // Amplitude(s) for diagram number 1129
+    FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1130 OF 1240 ***
+    // Wavefunction(s) for diagram number 1130
+    // (none)
+    // Amplitude(s) for diagram number 1130
+    FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1131 OF 1240 ***
+    // Wavefunction(s) for diagram number 1131
+    // (none)
+    // Amplitude(s) for diagram number 1131
+    FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
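With one kernel per diagram and a uniform signature, the 1240 helpers can be driven mechanically; how the plugin actually sequences them is not shown in this diff. A host-side sketch for the GPU build follows (gpublocks and gputhreads are illustrative launch parameters, and the back-to-back launch order is an assumption):

// Sketch only: launch the per-diagram kernels in sequence; all of them
// accumulate into the same jamps buffer via the shared wfs workspace.
diagram1131<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
diagram1132<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... one launch per diagram, up to diagram1240 ...
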
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1132 OF 1240 ***
+    // Wavefunction(s) for diagram number 1132
+    // (none)
+    // Amplitude(s) for diagram number 1132
+    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1133 OF 1240 ***
+    // Wavefunction(s) for diagram number 1133
+    // (none)
+    // Amplitude(s) for diagram number 1133
+    FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1134 OF 1240 ***
+    // Wavefunction(s) for diagram number 1134
+    FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    // Amplitude(s) for diagram number 1134
+    FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1135 OF 1240 ***
+    // Wavefunction(s) for diagram number 1135
+    // (none)
+    // Amplitude(s) for diagram number 1135
+    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1136 OF 1240 ***
+    // Wavefunction(s) for diagram number 1136
+    // (none)
+    // Amplitude(s) for diagram number 1136
+    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1137 OF 1240 ***
+    // Wavefunction(s) for diagram number 1137
+    // (none)
+    // Amplitude(s) for diagram number 1137
+    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1138 OF 1240 ***
+    // Wavefunction(s) for diagram number 1138
+    // (none)
+    // Amplitude(s) for diagram number 1138
+    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1139 OF 1240 ***
+    // Wavefunction(s) for diagram number 1139
+    // (none)
+    // Amplitude(s) for diagram number 1139
+    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1140 OF 1240 ***
+    // Wavefunction(s) for diagram number 1140
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1140
+    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1141 OF 1240 *** + // Wavefunction(s) for diagram number 1141 + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1141 + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + 
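Throughout these kernels the colour algebra follows one fixed pattern: each diagram produces a single complex amplitude amp_sv[0], which is then added with a weight of +1, -1, +i or -i into the subset of colour-flow amplitudes ("jamps") that the diagram feeds; the indices used in this process run up to 119, i.e. 120 colour flows. A minimal scalar sketch of that accumulation, with std::complex<double> standing in for cxtype and a plain array standing in for the J_ACCESS kernel accessor (illustrative names only, not the actual cudacpp API):

#include <complex>
using cx = std::complex<double>; // scalar stand-in for cxtype
constexpr int ncolor = 120;      // jamp indices 0..119 appear in this process

// Hypothetical helper: fold one diagram amplitude into the four colour flows
// it feeds, mirroring e.g. the first FFV1_0 term of diagram 1152 further below.
void accumulateFourFlowTerm( cx jamp[ncolor], const cx& amp )
{
  jamp[2] += amp;
  jamp[12] -= amp;
  jamp[36] -= amp;
  jamp[78] += amp;
}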
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1142 OF 1240 ***
+ // Wavefunction(s) for diagram number 1142
+ VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
+ VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
+ // Amplitude(s) for diagram number 1142
+ VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1143 OF 1240 ***
+ // Wavefunction(s) for diagram number 1143
+ // (none)
+ // Amplitude(s) for diagram number 1143
+ VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
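Every diagramXXXX kernel in this hunk repeats the same signature and the same two comment lines: the interface stays uniform whether or not MGONGPU_SUPPORTS_MULTICHANNEL is defined, and when it is not defined the included boilerplate is stated to assert that the channelIds, numerators and denominators pointers are all nullptr. The contents of diagram_boilerplate.h are not shown in this diff; under that stated behaviour, a plausible minimal sketch of the guard (not the actual header, which presumably also sets up amp_sv, w_fp and the COUPs/couplings view) would be:

#include <cassert>
// Sketch only: sanity check when multichannel (SDE) support is compiled out.
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
assert( channelIds == nullptr );   // SDE channel choice is compiled out
assert( numerators == nullptr );   // no multichannel numerators to update
assert( denominators == nullptr ); // no multichannel denominators to update
#endif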
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1144 OF 1240 ***
+ // Wavefunction(s) for diagram number 1144
+ FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+ FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
+ FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+ // Amplitude(s) for diagram number 1144
+ FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1145 OF 1240 ***
+ // Wavefunction(s) for diagram number 1145
+ // (none)
+ // Amplitude(s) for diagram number 1145
+ FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
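In the jamp updates just above and below, the weight cxtype( 0, 1 ) is the imaginary unit i, so these colour flows receive plus or minus i times the diagram amplitude instead of plus or minus the amplitude itself. The same arithmetic in plain std::complex terms (illustrative stand-ins only, not the cudacpp types):

#include <complex>
using cx = std::complex<double>; // scalar stand-in for cxtype

void addTimesMinusI( cx& jamp48, const cx& amp )
{
  const cx I( 0., 1. ); // cxtype( 0, 1 )
  jamp48 -= I * amp;    // as in "kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]"
}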
J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1146 OF 1240 *** + // Wavefunction(s) for diagram number 1146 + // (none) + // Amplitude(s) for diagram number 1146 + FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1147 OF 1240 *** + // Wavefunction(s) for diagram number 1147 + // (none) + // Amplitude(s) for diagram number 1147 + FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] 
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1148 OF 1240 *** + // Wavefunction(s) for diagram number 1148 + // (none) + // Amplitude(s) for diagram number 1148 + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1149( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1149 OF 1240 *** + // Wavefunction(s) for diagram number 1149 + // (none) + // Amplitude(s) for diagram number 1149 + FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1150 OF 1240 *** + // 
Wavefunction(s) for diagram number 1150 + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + // Amplitude(s) for diagram number 1150 + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1151 OF 1240 *** + // Wavefunction(s) for diagram number 1151 + // (none) + // Amplitude(s) for diagram number 1151 + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1152 OF 1240 *** + // Wavefunction(s) for diagram number 1152 + // (none) + // Amplitude(s) for diagram number 1152 + FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1153 OF 1240 *** + // Wavefunction(s) for diagram number 1153 + // (none) + // Amplitude(s) for diagram number 1153 + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( 
w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1154 OF 1240 *** + // Wavefunction(s) for diagram number 1154 + // (none) + // Amplitude(s) for diagram number 1154 + FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1155 OF 1240 *** + // Wavefunction(s) for diagram number 1155 + // (none) + // Amplitude(s) for diagram number 1155 + FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1156 OF 1240 *** + // Wavefunction(s) for diagram number 1156 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1156 + VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[98], w_fp[8], 
w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1157 OF 1240 *** + // Wavefunction(s) for diagram number 1157 + VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1157 + VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1158 OF 1240 *** + // Wavefunction(s) for diagram number 1158 + VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1158 + VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1159 OF 1240 *** + // Wavefunction(s) for diagram number 1159 + // (none) + // Amplitude(s) for diagram number 1159 + VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1160 OF 1240 *** + // Wavefunction(s) for diagram number 1160 + FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 1160 + FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1161 OF 1240 *** + // Wavefunction(s) for diagram number 1161 + // (none) + // Amplitude(s) for diagram number 1161 + FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1162 OF 1240 *** + // Wavefunction(s) for diagram number 1162 + // (none) + // Amplitude(s) for diagram number 1162 + FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1163 OF 1240 *** + // Wavefunction(s) for diagram number 1163 + // (none) + // Amplitude(s) for diagram number 1163 + FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that 
case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1164 OF 1240 *** + // Wavefunction(s) for diagram number 1164 + // (none) + // Amplitude(s) for diagram number 1164 + FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1165 OF 1240 *** + // Wavefunction(s) for diagram number 1165 + // (none) + // Amplitude(s) for diagram number 1165 + FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1166 OF 1240 *** + // Wavefunction(s) for diagram number 1166 + FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1166 + FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + 
FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1167 OF 1240 *** + // Wavefunction(s) for diagram number 1167 + // (none) + // Amplitude(s) for diagram number 1167 + FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1168 OF 1240 *** + // Wavefunction(s) for diagram number 1168 + // (none) + // Amplitude(s) for diagram number 1168 + FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1169 OF 1240 ***
+    // Wavefunction(s) for diagram number 1169
+    // (none)
+    // Amplitude(s) for diagram number 1169
+    FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1170 OF 1240 ***
+    // Wavefunction(s) for diagram number 1170
+    // (none)
+    // Amplitude(s) for diagram number 1170
+    FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1171 OF 1240 ***
+    // Wavefunction(s) for diagram number 1171
+    // (none)
+    // Amplitude(s) for diagram number 1171
+    FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1172 OF 1240 ***
+    // Wavefunction(s) for diagram number 1172
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 1172
+    FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1173 OF 1240 ***
+    // Wavefunction(s) for diagram number 1173
+    VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    // Amplitude(s) for diagram number 1173
+    FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
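[Reviewer note, not part of the generated code: the comment repeated in each kernel describes the contract of the shared diagram_boilerplate.h header, whose body is not shown in this diff. A plausible sketch of the sanity check it describes, under the assumption that it only needs to verify the unused SDE-related arguments, would be:]

  // Sketch only (assumption): the real diagram_boilerplate.h is not shown here.
  // Without multichannel support the SDE-related arguments must not be used,
  // so the boilerplate can assert that the caller passed null pointers.
  #include <cassert>
  using fptype = double; // hypothetical stand-in for the plugin's floating-point type

  inline void sanityCheckNoMultichannel( const unsigned int* channelIds,
                                         const fptype* numerators,
                                         const fptype* denominators )
  {
    // #ifndef MGONGPU_SUPPORTS_MULTICHANNEL all three SDE pointers are unused:
    assert( channelIds == nullptr );   // no per-event channel selection
    assert( numerators == nullptr );   // no multichannel numerators
    assert( denominators == nullptr ); // no multichannel denominators
  }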
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1174 OF 1240 ***
+    // Wavefunction(s) for diagram number 1174
+    // (none)
+    // Amplitude(s) for diagram number 1174
+    FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1175 OF 1240 ***
+    // Wavefunction(s) for diagram number 1175
+    FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 1175
+    FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1176 OF 1240 ***
+    // Wavefunction(s) for diagram number 1176
+    // (none)
+    // Amplitude(s) for diagram number 1176
+    FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1177 OF 1240 ***
+    // Wavefunction(s) for diagram number 1177
+    // (none)
+    // Amplitude(s) for diagram number 1177
+    FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1178 OF 1240 ***
+    // Wavefunction(s) for diagram number 1178
+    // (none)
+    // Amplitude(s) for diagram number 1178
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1179 OF 1240 ***
+    // Wavefunction(s) for diagram number 1179
+    // (none)
+    // Amplitude(s) for diagram number 1179
+    FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1180 OF 1240 ***
+    // Wavefunction(s) for diagram number 1180
+    // (none)
+    // Amplitude(s) for diagram number 1180
+    VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1181 OF 1240 ***
+    // Wavefunction(s) for diagram number 1181
+    // (none)
+    // Amplitude(s) for diagram number 1181
+    VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1182 OF 1240 ***
+    // Wavefunction(s) for diagram number 1182
+    VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] );
+    // Amplitude(s) for diagram number 1182
+    VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1183 OF 1240 ***
+    // Wavefunction(s) for diagram number 1183
+    // (none)
+    // Amplitude(s) for diagram number 1183
+    VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1184 OF 1240 ***
+    // Wavefunction(s) for diagram number 1184
+    // (none)
+    // Amplitude(s) for diagram number 1184
+    FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1185 OF 1240 ***
+    // Wavefunction(s) for diagram number 1185
+    // (none)
+    // Amplitude(s) for diagram number 1185
+    FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 )
+= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1186 OF 1240 *** + // Wavefunction(s) for diagram number 1186 + // (none) + // Amplitude(s) for diagram number 1186 + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1187 OF 1240 *** + // Wavefunction(s) for diagram number 1187 + // (none) + // Amplitude(s) for diagram number 1187 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1188 OF 1240 *** + // Wavefunction(s) for diagram number 1188 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 1188 + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1189 OF 1240 *** + // Wavefunction(s) for diagram number 1189 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1189 + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1190( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1190 OF 1240 *** + // Wavefunction(s) for diagram number 1190 + // (none) + // Amplitude(s) for diagram number 1190 + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1191 OF 1240 *** + // Wavefunction(s) for diagram number 1191 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 1191 + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
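// The "(#473)" placeholder comments in these kernels mark where code generated
// with multichannel support updates the single-diagram-enhancement (SDE)
// accumulators. A minimal commented sketch of what such an update plausibly
// looks like for this diagram (illustrative only: the NUM_ACCESS/DEN_ACCESS
// accessors and the cxabs2 helper are assumptions, not confirmed by this diff):
//
//   if( channelId == 1191 ) // only the chosen channel feeds the numerator
//     NUM_ACCESS::kernelAccess( numerators ) += cxabs2( amp_sv[0] );
//   if( channelId != 0 ) // channelId == 0 disables SDE (see signature above)
//     DEN_ACCESS::kernelAccess( denominators ) += cxabs2( amp_sv[0] );
//
// i.e. the numerator keeps only the chosen channel's |amp|^2 while the
// denominator sums |amp|^2 over all diagrams, yielding the per-channel weight.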
+#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1192 OF 1240 *** + // Wavefunction(s) for diagram number 1192 + // (none) + // Amplitude(s) for diagram number 1192 + FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1193 OF 1240 *** + // Wavefunction(s) for diagram number 1193 + // (none) + // Amplitude(s) for diagram number 1193 + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
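// Every diagramXXXX kernel in this file repeats the same signature and then
// pulls in "diagram_boilerplate.h", so each generated body reduces to HELAS
// calls plus jamp updates. A minimal sketch of what that shared include
// plausibly expands to (illustrative; the real header may differ, and the
// w_fp/_fp setup shown here is an assumption based on the names used below):
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // uniform interface without multichannel: the three SDE pointers must not
//   // be used, so assert that they are all nullptr as a sanity check
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif
//   fptype_sv amp_sv[1] = {};                          // scratch amplitude for this diagram
//   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // the buffer passed as &_fp[0]
//   fptype* w_fp[nwf];                                 // per-event views into the wfs buffer
//
// Splitting the former monolithic wavefunction/amplitude computation into one
// __global__ kernel per diagram keeps each compilation unit small.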
J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1194 OF 1240 *** + // Wavefunction(s) for diagram number 1194 + // (none) + // Amplitude(s) for diagram number 1194 + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1195 OF 1240 *** + // Wavefunction(s) for diagram number 1195 + // (none) + // Amplitude(s) for diagram number 1195 + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 
) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1196 OF 1240 *** + // Wavefunction(s) for diagram number 1196 + // (none) + // Amplitude(s) for diagram number 1196 + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + 
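// The updates around this point accumulate diagram 1196's amplitude into the
// ncolor color-flow sums held in jamps[ncolor*2*nevtORneppV] (see the kernel
// signatures above). A commented sketch of the accessor pattern these lines
// rely on (illustrative; the exact J_ACCESS memory layout and the
// cxtype_sv_ref return type are assumptions, not confirmed by this diff):
//
//   // return a complex reference to color flow icol for the current event(s)
//   static __device__ inline cxtype_sv_ref kernelAccessIcol( fptype* jamps, const int icol );
//
//   J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];                  // color coefficient +1
//   J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; // color coefficient -i
//
// Only the sign and an optional factor of i differ between color flows, so the
// final color sum |M|^2 = sum_ij jamp_i^* cf_ij jamp_j reduces to reading back
// these accumulated jamps.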
J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1197 OF 1240 *** + // Wavefunction(s) for diagram number 1197 + // (none) + // Amplitude(s) for diagram number 1197 + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += 
amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1198 OF 1240 *** + // Wavefunction(s) for diagram number 1198 + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1198 + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] 
); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1199 OF 1240 *** + // Wavefunction(s) for diagram number 1199 + // (none) + // Amplitude(s) for diagram number 1199 + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1200 OF 1240 *** + // Wavefunction(s) for diagram number 1200 + // (none) + // Amplitude(s) for diagram number 1200 + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1201 OF 1240 *** + // Wavefunction(s) for diagram number 1201 + // (none) + // Amplitude(s) for diagram number 1201 + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1202( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1202 OF 1240 *** + // Wavefunction(s) for diagram number 1202 + // (none) + // Amplitude(s) for diagram number 1202 + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + 
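The repeated boilerplate comments in the diagramXXX kernels above reference two pieces of machinery that this patch only points at: diagram_boilerplate.h is included but its contents are not shown, and every "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" block here is an empty placeholder because this file was generated without multichannel support. As a reading aid, here is a minimal sketch of the contract those comments describe; the scalar channelId, the cxabs2 squared-modulus helper and the exact assert form are illustrative assumptions, not contents of this patch.

  // Sketch of the sanity check that the comments attribute to diagram_boilerplate.h
  // (assumed form, the real header is not part of this diff): without multichannel
  // support, the three extra kernel arguments must all be nullptr.
  #include <cassert>
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // no SDE channel selection in this build
  assert( numerators == nullptr );   // no multichannel numerators to update
  assert( denominators == nullptr ); // no multichannel denominators to update
  #endif
  // Hypothetical shape of the per-amplitude update (#473) that a multichannel build
  // would generate at each placeholder, e.g. after an amplitude call in diagram1202:
  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  if( channelId == 1202 ) numerators_sv += cxabs2( amp_sv[0] ); // only this diagram's channel
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // all channels while SDE is enabled (channelId 0 disables it)
  #endif

Keeping one uniform signature for every diagramXXX kernel lets both builds chain the kernels identically; the only behavioural difference is whether these two accumulations are compiled in.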
//-------------------------------------------------------------------------- + + __global__ void + diagram1203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1203 OF 1240 *** + // Wavefunction(s) for diagram number 1203 + // (none) + // Amplitude(s) for diagram number 1203 + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1204 OF 1240 *** + // Wavefunction(s) for diagram number 1204 + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 1204 + FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1205 OF 1240 *** + // Wavefunction(s) for diagram number 1205 + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1205 + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1206( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1206 OF 1240 *** + // Wavefunction(s) for diagram number 1206 + // (none) + // Amplitude(s) for diagram number 1206 + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1207 OF 1240 *** + // Wavefunction(s) for diagram number 1207 + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1207 + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1208 OF 1240 *** + // Wavefunction(s) for diagram number 1208 + // (none) + // Amplitude(s) for diagram number 1208 + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1209 OF 1240 *** + // Wavefunction(s) for diagram number 1209 + // (none) + // Amplitude(s) for diagram number 1209 + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: 
dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1210 OF 1240 *** + // Wavefunction(s) for diagram number 1210 + // (none) + // Amplitude(s) for diagram number 1210 + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 
0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1211 OF 1240 *** + // Wavefunction(s) for diagram number 1211 + // (none) + // Amplitude(s) for diagram number 1211 + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1212 OF 1240 *** + // Wavefunction(s) for diagram number 1212 + // (none) + // Amplitude(s) for diagram number 1212 + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1213 OF 1240 *** + // Wavefunction(s) for diagram number 1213 + // (none) + // Amplitude(s) for diagram number 1213 + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1214 OF 1240 *** + // Wavefunction(s) for diagram number 1214 + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1214 + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1215 OF 1240 *** + // Wavefunction(s) for diagram number 1215 + // (none) + // Amplitude(s) for diagram number 1215 + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1216 OF 1240 *** + // Wavefunction(s) for diagram number 1216 + // (none) + // Amplitude(s) for diagram number 1216 + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
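In the FFV1_0 blocks of diagram 1216, cxtype( 0, 1 ) constructs the imaginary unit, so each statement adds or subtracts i times the amplitude rather than the amplitude itself. Scalar equivalent (illustration only; jamp and amp are shorthand for the accessed values):

    const cxtype cI( 0., 1. ); // the imaginary unit i
    jamp[51] -= cI * amp;      // same effect as J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]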
J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1217 OF 1240 *** + // Wavefunction(s) for diagram number 1217 + // (none) + // Amplitude(s) for diagram number 1217 + FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity 
ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1218 OF 1240 *** + // Wavefunction(s) for diagram number 1218 + // (none) + // Amplitude(s) for diagram number 1218 + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1219 OF 1240 *** + // Wavefunction(s) for diagram number 1219 + // (none) + // Amplitude(s) for diagram number 1219 + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1220 OF 1240 *** + // Wavefunction(s) for diagram number 1220 + // (none) + // Amplitude(s) for diagram number 1220 + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1221( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all 
three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1221 OF 1240 *** + // Wavefunction(s) for diagram number 1221 + VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); + VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1221 + VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1222( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1222 OF 1240 *** + // Wavefunction(s) for diagram number 1222 + // (none) + // Amplitude(s) for diagram number 1222 + VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + 
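As the recurring comment says, diagram_boilerplate.h keeps one uniform diagramNNNN interface even when multichannel support is compiled out, and in that build it checks that the three multichannel pointers are null. A minimal sketch of that sanity check, using assert from <cassert> (an assumption about the header's content, shown for orientation; the real include also sets up the local wavefunction and amplitude accessors):

    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
      assert( channelIds == nullptr );   // no SDE channel selection without multichannel support
      assert( numerators == nullptr );   // multichannel numerators are not allocated
      assert( denominators == nullptr ); // multichannel denominators are not allocated
    #endif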
J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1223( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1223 OF 1240 *** + // Wavefunction(s) for diagram number 1223 + // (none) + // Amplitude(s) for diagram number 1223 + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
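Each '#ifdef MGONGPU_SUPPORTS_MULTICHANNEL' placeholder above stands where a multichannel-enabled build updates the single-diagram-enhancement counters (#473). In cudacpp-generated multichannel code this update typically has the form below (sketched from the pattern used elsewhere in the plugin; the diagram number 1223 is used here only as an illustrative channel, and the exact guards should be taken as an assumption rather than a quote of this patch):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1223 ) numerators_sv += cxabs2( amp_sv[0] ); // enhance this diagram's channel
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // channelId == 0 disables the SDE
    #endif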
J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1224 OF 1240 *** + // Wavefunction(s) for diagram number 1224 + // (none) + // Amplitude(s) for diagram number 1224 + FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1225( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1225 OF 1240 *** + // Wavefunction(s) for diagram number 1225 + // (none) + // Amplitude(s) for diagram number 1225 + FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) 
-= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1226( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1226 OF 1240 *** + // Wavefunction(s) for diagram number 1226 + // (none) + // Amplitude(s) for diagram number 1226 + FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1227( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1227 OF 1240 *** + // Wavefunction(s) for diagram number 1227 + // (none) + // Amplitude(s) for diagram number 1227 + VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + 
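A reading aid for the ALOHA/HELAS-style calls in these kernels: routines ending in _0 (FFV1_0, VVV1_0, VVVV1_0/VVVV3_0/VVVV4_0) contract external wavefunctions into one amplitude written through the last pointer argument, while P0_1-style routines (e.g. VVV1P0_1) build an internal off-shell current, here with zero mass and width, into the w_fp slot given last. Schematically (indices a..d stand in for the concrete slots used above):

    VVV1P0_1( w_fp[a], w_fp[b], COUPs[0], 1.0, 0., 0., w_fp[c] );                 // off-shell current into w_fp[c] (mass=0., width=0.)
    VVVV1_0( w_fp[a], w_fp[b], w_fp[c], w_fp[d], COUPs[2], 1.0, &amp_fp[0] );     // amplitude into amp_fp[0], read back as amp_sv[0]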
J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1228( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1228 OF 1240 *** + // Wavefunction(s) for diagram number 1228 + VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); + VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); + // Amplitude(s) for diagram number 1228 + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1229( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1229 OF 1240 *** + // Wavefunction(s) for diagram number 1229 + // (none) + // Amplitude(s) for diagram number 1229 + VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1230( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1230 OF 1240 *** + // Wavefunction(s) for diagram number 1230 + // (none) + // Amplitude(s) for diagram number 1230 + FFV1_0( w_fp[3], 
w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1231( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1231 OF 1240 *** + // Wavefunction(s) for diagram number 
1231 + // (none) + // Amplitude(s) for diagram number 1231 + FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1232( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1232 OF 1240 *** + // Wavefunction(s) for diagram number 1232 + // (none) + // Amplitude(s) for diagram number 1232 + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1233( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1233 OF 1240 *** + // Wavefunction(s) for diagram number 1233 + // (none) + // Amplitude(s) for diagram number 1233 + FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 104 ) += amp_sv[0]; + FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1234( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1234 OF 1240 *** + // Wavefunction(s) for diagram number 1234 + // (none) + // Amplitude(s) for diagram number 1234 + VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1235 OF 1240 *** + // Wavefunction(s) for diagram number 1235 + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); + // Amplitude(s) for diagram number 1235 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1236 OF 1240 *** + // Wavefunction(s) for diagram 
number 1236 + // (none) + // Amplitude(s) for diagram number 1236 + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1237( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1237 OF 1240 *** + // Wavefunction(s) for diagram number 1237 + // (none) + // Amplitude(s) for diagram number 1237 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
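[Editor's note] Every diagramXXXX kernel in this patch follows the same pattern: one or more HELAS-style calls (FFV1_0, VVV1_0, VVVV1_0, ...) compute a single complex amplitude, which is then added to or subtracted from a fixed subset of the color-ordered partial amplitudes in jamps, in some cases with an extra factor of i (cxtype( 0, 1 )). The sketch below models only that accumulation step in plain standalone C++; accumulateDiagram, flows and the std::complex types are hypothetical stand-ins for illustration, not the generated code's actual interface.

#include <complex>
#include <utility>
#include <vector>

// Hypothetical model of the per-diagram color-flow accumulation: each
// (icol, sign) pair mirrors one generated line such as
//   J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
void accumulateDiagram( std::vector<std::complex<double>>& jamps,      // ncolor partial amplitudes
                        const std::complex<double>& amp,               // one computed amplitude
                        const std::vector<std::pair<int, int>>& flows, // (icol, +1 or -1) color flows
                        bool timesI = false )                          // true for the cxtype( 0, 1 ) variants
{
  const std::complex<double> a = timesI ? std::complex<double>( 0., 1. ) * amp : amp;
  for( const auto& [icol, sign] : flows ) jamps[icol] += double( sign ) * a;
}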
//-------------------------------------------------------------------------- + + __global__ void + diagram1238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1238 OF 1240 *** + // Wavefunction(s) for diagram number 1238 + // (none) + // Amplitude(s) for diagram number 1238 + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1239 OF 1240 *** + // Wavefunction(s) for diagram number 1239 + // (none) + // Amplitude(s) for diagram number 1239 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1240 OF 1240 *** + // Wavefunction(s) for diagram number 1240 + // (none) + // Amplitude(s) for diagram number 1240 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
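As a reading aid for the three stridings discussed in DeviceAccessJamp above, here is a minimal standalone sketch of the index arithmetic (illustrative only: the struct and function names are hypothetical, while the ncolor/nevt/icol/ievt conventions follow the header):

// Offsets of the real and imaginary parts of jamp(icol,ievt) within the fptype buffer
struct JampOffsets
{
  // "old" striding: ncolor separate 2*nevt blocks, one per color (ievt last)
  static int oldRe( int icol, int ievt, int /*ncolor*/, int nevt ) { return icol * 2 * nevt + ievt; }
  static int oldIm( int icol, int ievt, int /*ncolor*/, int nevt ) { return icol * 2 * nevt + nevt + ievt; }
  // "new1" striding (now used everywhere): one ncolor*nevt plane for Re, then one for Im (ievt last)
  static int new1Re( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
  static int new1Im( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }
};

The point of "new1" is that each color row stays contiguous across events, so the buffer can be handed to cuBLAS/hipBLAS directly as two ncolor x nevt matrices without any repacking.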
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
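Note that, on the source side, the whole HASBLAS machinery above compiles down to a single macro: hasNoBlas adds -DMGONGPU_HAS_NO_BLAS to BLASCXXFLAGS, while hasBlas adds the -lcublas/-lhipblas link flags used in the recipes below. A hedged sketch of the kind of compile-time guard this enables (the helper below is hypothetical, not code from the plugin):

// Hypothetical helper showing how HASBLAS=hasNoBlas steers the GPU color sum at compile time
inline bool colorSumUsesBlas()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return true; // hasBlas build: color_sum_gpu may hand the jamp matrices to cuBLAS/hipBLAS
#else
  return false; // hasNoBlas build: only the plain CUDA/HIP reduction kernels are used
#endif
}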
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
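The header introduced here is not an ordinary include: as the diagram1239/diagram1240 hunks earlier in this patch show, it is textually included at the top of every generated diagramXXX kernel, so that each kernel body reduces to its per-diagram amplitude calls and jamps updates. Schematically (diagramNNN and the wavefunction indices are placeholders, not generated code):

__global__ void
diagramNNN( fptype* wfs, fptype* jamps, const unsigned int* channelIds, /* couplings or COUPs, */ fptype* numerators, fptype* denominators )
{
#include "diagram_boilerplate.h" // sets up W_ACCESS/J_ACCESS etc., w_fp[], COUPs[], amp_sv/amp_fp
  FFV1_0( w_fp[0], w_fp[1], w_fp[2], COUPs[1], 1.0, &amp_fp[0] ); // per-diagram amplitude
  J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; // per-color accumulation
}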
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. 
); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J.
Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 1baee42e06..31ed3df613 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
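Looking back at the mgOnGpuCxtypes.h hunk above: the new operator+= and operator-= let the generated jamps updates accumulate a complex value through the two separate real/imaginary fptype pointers held by a cxtype_ref in a single expression. A minimal usage sketch (standalone and illustrative; the variable names are not from the plugin):

fptype jampRe = 0., jampIm = 0.;     // e.g. one (icol,ievt) slot of the two "new1" jamp planes
const cxtype amp = cxmake( 1., 0. ); // some amplitude value
cxtype_ref jref( jampRe, jampIm );   // refers to the (Re,Im) pair without copying it
jref += cxtype( 0, 1 ) * amp;        // in-place 'jamp += i*amp', as in the diagram kernels
jref -= cxtype( 0, 1 ) * amp;        // in-place 'jamp -= i*amp'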
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006198406219482422  +DEBUG: model prefixing takes 0.005407094955444336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -171,16 +171,16 @@ Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -200,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -211,62 +211,62 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -Wrote files for 32 helas calls in 0.164 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Wrote files for 32 helas calls in 0.167 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.141 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.127 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.210s -user 0m1.890s -sys 0m0.303s -Code generation completed in 2 seconds +real 0m2.227s +user 0m1.929s +sys 0m0.297s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -279,7 +279,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,9 +287,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -309,7 +309,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -317,9 +317,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index 795e11afaf..9dfd669871 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 66a805e521..3db737130c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 8c0f1e2199..47c2051950 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gq_ttq.mad/Source/makefile b/epochX/cudacpp/gq_ttq.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/makefile +++ b/epochX/cudacpp/gq_ttq.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 99573ab87a..1ee522dbfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)!
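The removed comments above document the old design: wavefunctions and amplitudes had "trivial access" as thread-local arrays because there was no kernel splitting yet. With the per-diagram kernels introduced by this PR, CUDA wavefunctions move to a global-memory super-buffer, sliced per good helicity as wf[nwf*nw6*2*nevt] (see the ghelAllWfs offsets later in this diff). A standalone sketch of that buffer arithmetic, assuming an events-fastest SoA order (the exact component order inside a slice is an assumption made here for illustration):

#include <cstdio>

int main()
{
  // Dimensions used in this diff: nwf=8 wavefunctions of nw6=6 complex components
  // (nx2=2 floats each) per event, with one slice per good helicity in ghelAllWfs
  const int nevt = 16, nwf = 8, nw6 = 6, nx2 = 2, nGoodHel = 4;
  const int sliceSize = nwf * nw6 * nx2 * nevt; // wf[nwf*nw6*2*nevt], as in the calculate_jamps signature
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    printf( "helicity slice %d starts at offset %d\n", ighel, ighel * sliceSize );
  // Hypothetical SoA index of component (iwf, iw6, ix2) of event ievt inside one slice:
  // events run fastest so that consecutive GPU threads touch consecutive addresses
  const int iwf = 5, iw6 = 3, ix2 = 1, ievt = 7;
  const int idx = ( ( iwf * nw6 + iw6 ) * nx2 + ix2 ) * nevt + ievt;
  printf( "component (%d,%d,%d) of event %d sits at index %d of %d\n", iwf, iw6, ix2, ievt, idx, sliceSize );
  return 0;
}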
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
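For reference, the color algebra removed above computes |M|^2 as the quadratic form sum_{i,j} jamp_i^* cf[i][j] jamp_j / denom[i]; since cf is real and symmetric and jamp = A + iB, this reduces to A.M.A + B.M.B, and the cf2 trick folds the factor 2 and the 1/denom normalization into a triangular matrix at compile time. A standalone numerical sketch (double precision, using the gu_ttxu cf/denom values above; the sample jamp values are made up) checking that the two formulations agree:

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -.5, .3 }, { .7, -.1 }, { 0., 1. } };
  // Full quadratic form (the "CUDA" variant in the removed code)
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = 0, ztI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztR += cf[i][j] * jamp[j].real();
      ztI += cf[i][j] * jamp[j].imag();
    }
    me1 += ( ztR * jamp[i].real() + ztI * jamp[i].imag() ) / denom[i];
  }
  // Triangular normalized form (the "C++" variant): diagonal once, off-diagonal twice
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 );
  printf( "me1=%f me2=%f\n", me1, me2 );
  return 0;
}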
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
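The add_and_select_hel fragment above draws one helicity per event by inverse-CDF sampling: the per-helicity |M|^2 values are first accumulated into a running sum (reusing the ghelAllMEs buffer in place), and the first good helicity whose cumulative fraction exceeds the random number allrndhel[ievt] is selected, in the Fortran convention [1,ncomb]. A host-side sketch of the same logic with made-up numbers (illustration only):

#include <cstdio>

int main()
{
  const int nGoodHel = 4;
  const int goodHel[nGoodHel] = { 0, 3, 5, 6 };  // hypothetical good helicity indices
  double mes[nGoodHel] = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical |M|^2 per good helicity
  const double rndhel = 0.65;                    // random number in [0,1)
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    total += mes[ighel];
    mes[ighel] = total; // reuse the buffer to store the running sum, as in the kernel above
  }
  int selhel = -1;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    if( rndhel < mes[ighel] / total )
    {
      selhel = goodHel[ighel] + 1; // Fortran convention [1,ncomb]
      break;
    }
  }
  printf( "selected ihelF=%d\n", selhel ); // cumulative fractions are 0.1, 0.5, 0.7, 1.0: rndhel=0.65 picks goodHel[2]+1 = 6
  return 0;
}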
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
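The update_jamp2s kernel above must use atomicAdd because, with one CUDA stream per good helicity, several kernel instances may accumulate into the same colAllJamp2s buffer concurrently. A self-contained CUDA sketch of that accumulate-across-streams pattern (the kernel name and sizes are hypothetical; atomicAdd on double requires compute capability 6.0 or later, e.g. build with -arch=sm_60):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void accumulate( double* sums, const double* contrib, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) atomicAdd( &sums[i], contrib[i] ); // safe even if stream executions overlap
}

int main()
{
  const int n = 256, nstreams = 4;
  double *sums, *contrib;
  cudaMallocManaged( &sums, n * sizeof( double ) );
  cudaMallocManaged( &contrib, n * sizeof( double ) );
  for( int i = 0; i < n; i++ ) { sums[i] = 0; contrib[i] = 1; }
  cudaStream_t streams[nstreams];
  for( int s = 0; s < nstreams; s++ ) cudaStreamCreate( &streams[s] );
  for( int s = 0; s < nstreams; s++ ) // one launch per stream, like one launch per helicity above
    accumulate<<<1, n, 0, streams[s]>>>( sums, contrib, n );
  cudaDeviceSynchronize();
  printf( "sums[0]=%f (expect %d)\n", sums[0], nstreams );
  for( int s = 0; s < nstreams; s++ ) cudaStreamDestroy( streams[s] );
  cudaFree( sums );
  cudaFree( contrib );
  return 0;
}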
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -970,20 +1049,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
   //static const int namplitudes = 5;
   //static const int ncomb = 32; // CPPProcess::ncomb
@@ -125,23 +126,26 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps, // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs, // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+                       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
                        bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
                        const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -155,34 +159,46 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol, // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..2d49642e74 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..a45203b57e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#include "color_sum.h"
+
+#include "mgOnGpuConfig.h"
+
+#include "MemoryAccessMatrixElements.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
+
+  //--------------------------------------------------------------------------
+
+  // *** COLOR MATRIX BELOW ***
+
+  // The color denominators (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
+
+  // The color matrix (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorMatrix[ncolor][ncolor] = {
+    { 12, 4, 4, 0 },
+    { 4, 12, 0, 4 },
+    { 4, 0, 12, 4 },
+    { 0, 4, 4, 12 } }; // 2-D array[4][4]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...)
in speed here as we only loop over the upper-triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..941311a2bb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
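Note on the two-pass BLAS color sum in color_sum.cc above: since the color matrix M is real and symmetric, the quadratic form (A-iB)^T M (A+iB) reduces to A^T M A + B^T M B, so the real and imaginary parts of the jamps can be contracted independently. The first gpuBlasTgemm pass computes Ztemp = NormColMat * Jamps for all events at once; the strided-batched pass then takes one dot product Jamps . Ztemp per event and accumulates it into the MEs (beta=1). Below is a minimal host-side sketch of the same contraction, assuming the "new1" striding icol*nevt+ievt documented above; colorSumTwoStep is a hypothetical standalone name, not part of the plugin, and plain loops stand in for the two BLAS calls.

  #include <initializer_list>
  #include <vector>

  void colorSumTwoStep( double* MEs,              // output: MEs[nevt], incremented by the color sum for one helicity
                        const double* jampsRe,    // input: Re(jamp)[ncolor*nevt], "new1" striding icol*nevt+ievt
                        const double* jampsIm,    // input: Im(jamp)[ncolor*nevt], same striding
                        const double* normColMat, // input: colorMatrix[i][j]/colorDenom[i], row-major [ncolor*ncolor]
                        int ncolor, int nevt )
  {
    std::vector<double> ztemp( ncolor );
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // M is real and symmetric: (A-iB)^T M (A+iB) = A^T M A + B^T M B, so handle Re and Im separately
      for( const double* jamps : { jampsRe, jampsIm } )
      {
        // Step 1 (the GEMM pass): ztemp = normColMat * jamp for this event
        for( int i = 0; i < ncolor; i++ )
        {
          ztemp[i] = 0;
          for( int j = 0; j < ncolor; j++ )
            ztemp[i] += normColMat[i * ncolor + j] * jamps[j * nevt + ievt];
        }
        // Step 2 (the batched dot-product pass): ME += jamp . ztemp
        for( int i = 0; i < ncolor; i++ )
          MEs[ievt] += jamps[i * nevt + ievt] * ztemp[i];
      }
    }
  }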
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 5 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6.
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 5 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 5 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 5 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 5 ***
+    // Wavefunction(s) for diagram number 5
+    // (none)
+    // Amplitude(s) for diagram number 5
+    VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 1efce64e40..645a4d6016 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -341,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +387,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +431,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
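The next matrix1.f hunk rewrites the color sum to walk a packed upper triangle: CF now stores the rows of the upper triangle consecutively (NCOLOR*(NCOLOR+1)/2 = 10 integers for NCOLOR=4), with off-diagonal entries pre-doubled (8 = 2*4 above) so that summing only J >= I reproduces the full symmetric quadratic form, and a common integer DENOM is divided out once at the end. Below is a minimal C++ transcription of the new loop, assuming NAMPSO=1; matrix1ColorSum is a hypothetical standalone name, not part of the generated code.

  #include <complex>

  // Packed upper-triangular color sum: Re( JAMP^dagger CF JAMP ) / DENOM,
  // with CF's off-diagonal entries already doubled so the J >= I loop suffices.
  double matrix1ColorSum( const std::complex<double>* jamp, // input: jamp[ncolor], one color-ordered amplitude per color
                          const int* cfPacked,              // input: cfPacked[ncolor*(ncolor+1)/2], e.g. {12,8,8,0, 12,0,8, 12,8, 12}
                          int denom, int ncolor )
  {
    double result = 0;
    int cfIndex = 0; // CF_INDEX in the Fortran: a single running index over the packed entries
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0; // ZTEMP: row i of the upper triangle contracted with jamp[i..ncolor-1]
      for( int j = i; j < ncolor; j++ )
        ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
      result += ( ztemp * std::conj( jamp[i] ) ).real();
    }
    return result / denom; // MATRIX1 = MATRIX1/DENOM in the Fortran
  }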
@@ -507,10 +505,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +519,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 6dc0abd17c..50d05d273c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
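// [Illustrative sketch, not part of the patch] The comments above distinguish TRIVIAL ACCESS (a kernel owns a private buffer for one event or SIMD event page) from non-trivial access (a kernel receives a buffer for all events and must locate its own event record). A minimal sketch of the non-trivial AoSoA addressing, with hypothetical names (toyNeppV, toyIeventAccessRecord) standing in for neppV and the ieventAccessRecord accessors used in this file:

constexpr int toyNeppV = 4; // events per SIMD page (stand-in for neppV)

// Locate the record of event ievt in an AoSoA buffer with nfields scalar fields per event:
// the layout is buffer[ipagV][ifield][ieppV], so the page stride is nfields*toyNeppV
inline const double* toyIeventAccessRecord( const double* buffer, int nfields, int ievt )
{
  const int ipagV = ievt / toyNeppV; // SIMD page index
  const int ieppV = ievt % toyNeppV; // event index within the SIMD page
  return buffer + ipagV * nfields * toyNeppV + ieppV; // points at field 0 of event ievt
}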
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
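// [Illustrative sketch, not part of the patch] The deleted comments above describe the color-sum optimisation of #475: for a real color matrix M and jamp = A + iB, the quadratic form (A-iB) M (A+iB) reduces to A M A + B M B, and since M is also symmetric the sum can be folded into a loop over the diagonal plus the upper triangle with a factor 2 (precomputed together with 1/denom in the constexpr cf2 above). A standalone sketch using the gux_ttxux color matrix quoted above (colorSumToy is a hypothetical name):

#include <complex>

constexpr int ncolorToy = 4;
constexpr double denomToy[ncolorToy] = { 1, 1, 1, 1 };
constexpr double cfToy[ncolorToy][ncolorToy] = {
  { 12, 4, 4, 0 },
  { 4, 12, 0, 4 },
  { 4, 0, 12, 4 },
  { 0, 4, 4, 12 } };

double colorSumToy( const std::complex<double> jamp[ncolorToy] )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolorToy; icol++ )
  {
    // Diagonal term (normalized by denom), then off-diagonal terms folded with a factor 2 via cf[i][j] == cf[j][i]
    double ztempR = cfToy[icol][icol] / denomToy[icol] * jamp[icol].real();
    double ztempI = cfToy[icol][icol] / denomToy[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolorToy; jcol++ )
    {
      ztempR += 2 * cfToy[icol][jcol] / denomToy[icol] * jamp[jcol].real();
      ztempI += 2 * cfToy[icol][jcol] / denomToy[icol] * jamp[jcol].imag();
    }
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2; // equals Re( jamp^dagger (cf/denom) jamp ) because cf is real and symmetric
}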
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
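// [Illustrative sketch, not part of the patch] Both versions of sigmaKin_getGoodHel above implement the same filtering idea: probe only the first maxtry events, and flag a helicity combination as good if it yields a non-zero |M|^2 for at least one probed event, so that sigmaKin later loops only over the good helicities. A minimal sketch under that assumption (getGoodHelToy and me2ForHel are hypothetical names):

#include <algorithm>

constexpr int ncombToy = 32; // number of helicity combinations (ncomb for this process)

void getGoodHelToy( bool isGoodHel[ncombToy], int nevt, double ( *me2ForHel )( int ihel, int ievt ) )
{
  const int maxtry = std::min( 16, nevt ); // probe at most 16 events (avoid invalid access if nevt < 16)
  for( int ihel = 0; ihel < ncombToy; ihel++ )
  {
    isGoodHel[ihel] = false;
    for( int ievt = 0; ievt < maxtry; ievt++ )
      if( me2ForHel( ihel, ievt ) != 0 ) { isGoodHel[ihel] = true; break; }
  }
}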
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads #else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -970,20 +1049,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..a246f2aab0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..d50f96bb8d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the two cross terms cancel because M is also symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...)
in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..efb8277d2c --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
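For reference, the two gpuBlasTgemm calls and the two strided-batched calls above evaluate, per event, deltaME = sum_i sum_j Jre_i * Cn[i][j] * Jre_j + sum_i sum_j Jim_i * Cn[i][j] * Jim_j, i.e. the same "AMA + BMB" decomposition used by color_sum_cpu and color_sum_kernel; and, as the color_sum_gpu dispatcher above shows, the plain kernel is used instead whenever pBlasHandle is null (HASBLAS=hasNoBlas builds, or CUDACPP_RUNTIME_BLASCOLORSUM not set at runtime). The minimal host-side cross-check below reproduces the same arithmetic with plain loops over the "new1" layout; it is an illustrative sketch only (the function name and std::vector signatures are not from the plugin).

#include <vector>

// Reference color sum (sketch): for each event, add to allMEs[ievt]
//   sum_i sum_j Jre_i * Cn[i][j] * Jre_j + sum_i sum_j Jim_i * Cn[i][j] * Jim_j
// where Cn[i][j] = colorMatrix[i][j] / colorDenom[i] (real, row-major) and allJamps
// uses the "new1" layout allJamps[ix2 * ncolor * nevt + icol * nevt + ievt].
void colorSumReference( std::vector<double>& allMEs,           // [nevt], incremented in place
                        const std::vector<double>& allJamps,   // [2 * ncolor * nevt], "new1" layout
                        const std::vector<double>& normColMat, // [ncolor * ncolor], row-major Cn
                        const int ncolor,
                        const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double deltaME = 0;
    for( int ix2 = 0; ix2 < 2; ix2++ ) // ix2=0: real parts (A), ix2=1: imaginary parts (B)
    {
      const double* j = allJamps.data() + ix2 * ncolor * nevt;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztemp = 0; // Step 1 above: one element of the GEMM Ztemp = Cn * J
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += normColMat[icol * ncolor + jcol] * j[jcol * nevt + ievt];
        deltaME += ztemp * j[icol * nevt + ievt]; // Step 2 above: one batched dot product
      }
    }
    allMEs[ievt] += deltaME; // like color_sum, this ADDS to the running sum over helicities
  }
}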
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + FFV1_2( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c8fbbe9e22..cc34d12626 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -341,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +387,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +431,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
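The matrix1.f hunk above replaces the dense REAL*8 color matrix by a packed integer upper triangle plus a common DENOM: off-diagonal entries are stored pre-doubled (e.g. 8 = 2*4), so the triangular loop in the hunk that follows visits each (I,J) pair only once and still reproduces the full symmetric sum. The standalone check below illustrates that equivalence; it is a sketch with arbitrary test values, and it drops the NAMPSO loop over amplitude orders of the generated code.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 4;
  // Dense symmetric form (old code)
  const double cfDense[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                           { 4, 12, 0, 4 },
                                           { 4, 0, 12, 4 },
                                           { 0, 4, 4, 12 } };
  // Packed form (new code): upper triangle row by row, off-diagonals doubled,
  // common denominator factored out (DATA DENOM/1/ for this process)
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  const int denom = 1;
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 1 }, { 0.5, -1 }, { 2, 0 } }; // arbitrary
  // Old code: full double loop over the dense matrix
  double oldSum = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      oldSum += std::real( std::conj( jamp[i] ) * cfDense[i][j] * jamp[j] );
  // New code: triangular loop with a running CF_INDEX
  double newSum = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
    newSum += std::real( ztemp * std::conj( jamp[i] ) );
  }
  newSum /= denom; // MATRIX1 = MATRIX1/DENOM
  assert( std::abs( oldSum - newSum ) < 1e-9 ); // identical results
  return 0;
}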
@@ -507,10 +505,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +519,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
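The new color_sum.h header below gathers what the rest of the code needs from the color-sum module: the device and host jamp accessors and the color_sum_cpu / color_sum_gpu entry points, with the matching color_sum_cpp.o and color_sum_$(GPUSUFFIX).o objects added to the link in cudacpp.mk further down. As a hypothetical orientation sketch only (the actual call sites are in the generated CPPProcess.cc, which this diff does not show, and the wrapper name is the editor's own), a per-helicity GPU caller inside the mg5amcGpu namespace would look roughly like this:

#include "color_sum.h"
#ifdef MGONGPUCPP_GPUIMPL
// Sketch of a call site per helicity (illustrative only; signatures follow color_sum.h)
inline void addColorSumForOneHelicity( fptype* allMEs,         // running |M|^2 sum over helicities
                                       const fptype* allJamps, // jamps for this helicity ("new1" layout)
                                       fptype2* allBlasTmp,    // scratch buffer (BLAS path only, else nullptr)
                                       gpuStream_t stream,
                                       gpuBlasHandle_t* pBlasHandle, // nullptr selects the plain kernel
                                       const int gpublocks,
                                       const int gputhreads )
{
  createNormalizedColorMatrix(); // one-time copy of the normalized color matrix to device memory
  color_sum_gpu( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
}
#endif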
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! the min of dsqrt_shatmax**2 and the physical stot integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
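# ----------------------------------------------------------------------------------------------------
# Illustration (editor's sketch, not part of the patch): the 'flavour_bias' run_card parameter
# introduced in the banner.py hunks above takes [abs(PDG), factor]; events containing that flavour
# are generated 'factor' times more often while their weight is divided by the same factor, so any
# weighted sum (and hence the cross section) is unchanged. This is why a non-trivial factor forces
# 'event_norm = bias'. Minimal standalone bookkeeping with hypothetical names (this is not the
# MadEvent implementation):
def apply_flavour_bias(events, pdg=5, factor=100.0):
    """events: iterable of (pdg_codes, weight) pairs; returns the biased sample."""
    out = []
    for pdgs, weight in events:
        if any(abs(p) == pdg for p in pdgs):
            # oversample the requested flavour by 'factor', compensate in the weight
            out.extend([(pdgs, weight / factor)] * int(factor))
        else:
            out.append((pdgs, weight))
    return out

# invariant: sum(w for _, w in apply_flavour_bias(evts)) == sum(w for _, w in evts) for any 'evts'
# ----------------------------------------------------------------------------------------------------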
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, 
log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
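# ----------------------------------------------------------------------------------------------------
# Note on the gen_ximprove.py gridpack hunk above (editor's sketch, not part of the patch): the new
# nb_split formula, int(max(1, ((needed_event - 1) // self.max_request_event) + 1)), is the usual
# integer ceiling ceil(needed_event / max_request_event), clamped to at least 1 and later capped at
# self.max_splitting. A minimal check of the idiom:
def nb_split(needed_event, max_request_event, max_splitting):
    n = max(1, (needed_event - 1) // max_request_event + 1)  # ceiling division
    return min(n, max_splitting)

assert nb_split(2500, 2500, 40) == 1    # exact multiple -> a single job
assert nb_split(2501, 2500, 40) == 2    # one event over -> one more job
assert nb_split(10**9, 2500, 40) == 40  # capped by max_splitting
# ----------------------------------------------------------------------------------------------------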
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
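# ----------------------------------------------------------------------------------------------------
# Check of the lhe_parser.py pseudorapidity fix above (editor's sketch, not part of the patch): the
# textbook definition is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which is positive for a forward-going
# particle (pz > 0); the replaced line had the two terms swapped and therefore the opposite sign.
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))  # fixed sign convention

# for a particle at polar angle theta, eta = -ln(tan(theta/2))
theta = 0.3
eta = pseudorapidity(math.sin(theta), 0.0, math.cos(theta))
assert abs(eta - (-math.log(math.tan(theta / 2)))) < 1e-12
assert eta > 0  # pz > 0 must give eta > 0
# ----------------------------------------------------------------------------------------------------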
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
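A note on the driver selection earlier in this do_pythia8 hunk, before the card writing continues below: unless the new --old_interface option is given, MadEvent now looks for the main164 example shipped with Pythia8 itself (first under share/Pythia8/examples, then under examples) and only falls back to the old MG5aMC_PY8_interface, by re-invoking do_pythia8 with --old_interface, when main164 is not found; main164 additionally needs a -c flag in front of the command card, which is why the wrappers below splice in '' or '-c' depending on use_mg5amc_py8_interface. A simplified sketch of the resolution order (resolve_pythia_main is a name invented here; the paths and behaviour mirror the hunk above):

    import os

    def resolve_pythia_main(pythia8_path, use_old_interface, mg5amc_py8_interface_path=None):
        """Sketch of the executable-selection order in do_pythia8 (illustration only)."""
        if use_old_interface:
            # old behaviour: require the separately installed MG5aMC_PY8_interface tool
            if not mg5amc_py8_interface_path:
                raise RuntimeError('The MG5aMC_PY8_interface tool cannot be found')
            return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')
        # new default: the main164 example bundled with Pythia8 (run as 'main164 -c <card>')
        for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                          os.path.join(pythia8_path, 'examples', 'main164')):
            if os.path.exists(candidate):
                return candidate
        return None  # caller falls back to retrying with --old_interface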
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \
"""#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \
"""#!%%s
ln -s ./events_$1.lhe.gz ./events.lhe.gz
-./%s PY8Card_$1.dat >& PY8_log.txt
+./%%s %s PY8Card_$1.dat >& PY8_log.txt
mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1
""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the showered LHE file will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied.
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdirs stripped of those that produced an empty events.lhe file.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file (no points passed the cuts).' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s subchannels of %s.XXX have no events.lhe file (no points passed the cuts); no %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, - 'ngran':self.granularity, 'readonly': self.readonly} + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! self.gscalefact = x_improve.gscalefact #store jacobian associated to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
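The GridPackCmd changes above introduce two new constructor arguments, nprocs and maxevts, which are stored on the instance and forwarded to gen_ximprove.gen_ximprove_gridpack through refine_opt (where split_channels is now True), and refine4grid now returns the refined axsec so that launch can store it as self.gridpack_cross for the event-target computation in do_combine_events below. A hedged usage sketch (the keyword values and the import path are illustrative assumptions; the signature itself is taken from the hunk above):

    # Illustrative only: how the new gridpack knobs might be passed through.
    # The import path mirrors the generated bin/ scripts and is an assumption here.
    from internal import madevent_interface as ME

    launcher = ME.GridPackCmd(me_dir='/path/to/gridpack',  # hypothetical directory
                              nb_event=10000, seed=33, gran=-1,
                              nprocs=4,       # new: number of parallel refine processes
                              maxevts=2500)   # new: cap on events per refine job (default)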
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gq_ttq.mad/bin/madevent b/epochX/cudacpp/gq_ttq.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/madevent +++ b/epochX/cudacpp/gq_ttq.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index a304fc85c8..616eab36fd 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -948,7 +948,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -961,7 +961,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -980,7 +980,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -993,7 +993,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1012,7 +1012,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1025,7 +1025,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1043,7 +1043,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1056,7 +1056,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 998cb505a0..1dfc54c553 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 1565ed5888..a4ace231d8 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); GC_11s_sv = couplings_sv.GC_11; GC_10s_sv = couplings_sv.GC_10; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 8249ac5d67..948a9250a9 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006358146667480469  +DEBUG: model prefixing takes 0.005482912063598633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -184,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=1 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=1 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.670s -user 0m0.588s -sys 0m0.061s -Code generation completed in 1 seconds +real 0m0.684s +user 0m0.589s +sys 0m0.066s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
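(A note on the GpuRuntime.h hunk above: checkGpuBlas is the BLAS analogue of checkGpu, asserting whenever a gpuBlasStatus_t differs from GPUBLAS_STATUS_SUCCESS. Together with the vendor-neutral gpuBlas*/gpuStream* macros added in GpuAbstraction.h, one plausible usage sketch is the following; this is illustrative only, not generated code, and the 'handle'/'stream' names are hypothetical.)

gpuStream_t stream;
gpuStreamCreate( &stream ); // error checking via checkGpu is already built into this macro
#ifndef MGONGPU_HAS_NO_BLAS
gpuBlasHandle_t handle;
checkGpuBlas( gpuBlasCreate( &handle ) ); // prints file/line info and asserts unless GPUBLAS_STATUS_SUCCESS
checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS work through 'handle' now runs on 'stream'
// ... enqueue gpuBlasTgemm / gpuBlasTgemmStridedBatched calls here ...
checkGpuBlas( gpuBlasDestroy( handle ) );
#endif
gpuStreamDestroy( stream ); // error checking via checkGpu is already built into this macro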
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
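(For orientation on the color-sum offload in the MatrixElementKernels hunks above: per event and per helicity, the quantity delegated to cuBLAS/hipBLAS is the quadratic form |M|^2 = sum_{i,j} conj(jamp[i]) * CF[i][j] * jamp[j] over the ncolor leading-color amplitudes, with CF a constant real color matrix (normalization assumed folded in). A naive single-event reference is sketched below; it is illustrative only, with double standing in for fptype and a row-major CF layout assumed.)

#include <complex>
inline double colorSumReference( const std::complex<double>* jamp, // input: ncolor partial amplitudes for one event and helicity
                                 const double* CF,                 // input: ncolor*ncolor color matrix, row-major
                                 const int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += CF[icol * ncolor + jcol] * jamp[jcol]; // row icol of CF times the jamp vector
    me2 += ( std::conj( jamp[icol] ) * ztemp ).real(); // contract with conj(jamp)
  }
  return me2; // |M|^2 summed over leading colors
}

(Batched over nevt events and nGoodHel helicities, these inner products become the dense matrix products behind the gpuBlasTgemm/gpuBlasTgemmStridedBatched macros, which is what motivates the per-helicity jamp super-buffers, streams and BLAS handles introduced above.)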
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
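(On the MemoryAccessWavefunctions.h rewrite just above: with kernel splitting, wavefunctions move from per-thread local storage into a global device buffer, and the new DeviceAccessWavefunctions accessor locates each event's record simply at an offset of ievt * nw6 * nx2 fptypes, i.e. an AOS over events with one record of nw6 complex components per event. A host-side sketch of that indexing follows; it is illustrative only, with double standing in for fptype, and the AOS-record interpretation is inferred from the accessor code shown above.)

// Return the start of the wavefunction record of event ievt (nw6 complex components, stored as nw6*nx2 reals)
inline double* wfRecord( double* buffer, const int ievt, const int nw6 = 6, const int nx2 = 2 )
{
  return buffer + ievt * nw6 * nx2; // AOS over events: one [nw6][nx2] record per event
}
// Component iw6 (0..nw6-1) and real/imaginary part ix2 (0..1) of event ievt are then at
//   wfRecord( buffer, ievt )[ iw6 * nx2 + ix2 ]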
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 81ab8669a5..1ee522dbfd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,328 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
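For readers following this refactoring, the TRIVIAL vs non-trivial access distinction in these comments is an indexing convention: after the kernel split, each CUDA diagram kernel must find the wavefunctions written by the previous kernel in the global allWfs buffer rather than in registers or on the stack. Below is a minimal sketch of one plausible structure-of-arrays accessor for the wf[nwf*nw6*2*nevt] layout; the helper name wfAccess, the typedef and the exact index ordering are illustrative assumptions, not the plugin's actual accessor.

// Illustrative sketch only (assumed layout, not the generated code):
// wavefunction iwf, component iw6, real/imaginary part reim,
// for the current GPU thread's event ievt; events are contiguous
// in the innermost index so that accesses are coalesced.
typedef double fptype;  // assumption: the plugin's fptype may be float or double
constexpr int nw6 = 6;  // dimension of spin-1/2 or spin-1 wavefunctions (see comments above)
__device__ inline fptype&
wfAccess( fptype* allWfs, const int iwf, const int iw6, const int reim )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return allWfs[( ( iwf * nw6 + iw6 ) * 2 + reim ) * nevt + ievt];
}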
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
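The color algebra being removed here (and reinstated in the new color_sum.cc further below) is easier to digest in isolation. The following is a minimal self-contained sketch, not the generated code, of the same triangular color sum for the ncolor=4 matrix above, written for a single scalar event with plain double instead of the plugin's fptype2_sv vector types:

// Minimal sketch (assumption: scalar doubles, one event):
// |M|^2 += sum_{i,j} jamp_i^* cf[i][j] jamp_j / denom[i],
// folding the real symmetric cf into an upper-triangular pass.
#include <complex>
static constexpr int ncolor = 4;
static constexpr double denom[ncolor] = { 1, 1, 1, 1 };
static constexpr double cf[ncolor][ncolor] = {
  { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
double colorSum( const std::complex<double> jamp[ncolor] )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    // Diagonal term first, then doubled off-diagonal terms (cf is symmetric)
    double ztempR = cf[i][i] * jamp[i].real();
    double ztempI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] * jamp[j].real();
      ztempI += 2 * cf[i][j] * jamp[j].imag();
    }
    deltaME += ( jamp[i].real() * ztempR + jamp[i].imag() * ztempI ) / denom[i];
  }
  return deltaME;
}

The upper-triangular folding bakes the factor 2 and the 1/denom normalization into a compile-time matrix, which is exactly what the constexpr TriangularNormalizedColorMatrix above does.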
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -605,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -782,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -972,13 +1056,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1026,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1214,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
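The new color_sum.cc factors the color algebra out of the generated CPPProcess.cc. The underlying operation is unchanged: one helicity's contribution to |M|^2 is the quadratic form jamp-dagger times (cf/denom) times jamp over the ncolor leading-color amplitudes. A minimal reference sketch of that computation, using the ncolor=4 color matrix and unit denominators hardcoded in this file (illustrative standalone code, not the plugin's API):

```cpp
// Sketch only: reference color sum for one event and one helicity,
// using the ncolor=4 color matrix and denominators from this patch.
#include <array>
#include <complex>

double colorSumRef( const std::array<std::complex<double>, 4>& jamp )
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = { { 12, 4, 4, 0 },
                                       { 4, 12, 0, 4 },
                                       { 4, 0, 12, 4 },
                                       { 0, 4, 4, 12 } };
  double me2 = 0; // this helicity's contribution to |M|^2
  for( int i = 0; i < 4; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < 4; j++ ) ztemp += cf[i][j] * jamp[j];
    me2 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
  }
  return me2;
}
```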
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can expand the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel because M is also symmetric, i.e. AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangle of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
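The comment block above compresses two separate facts worth spelling out: because the color matrix M is real and symmetric, the Hermitian form (A-iB)M(A+iB) reduces to AMA + BMB, and the symmetric double loop can then be folded onto the upper triangle with doubled off-diagonal coefficients. A self-contained numerical check of that fold (a sketch, assuming unit color denominators as in this process):

```cpp
// Sketch only: check that the folded upper-triangle evaluation used in
// color_sum_cpu matches the naive full-matrix quadratic form AMA + BMB.
#include <cassert>
#include <cmath>

int main()
{
  constexpr int n = 4;
  constexpr double denom[n] = { 1, 1, 1, 1 };
  constexpr double cf[n][n] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const double A[n] = { 0.1, -0.2, 0.3, 0.4 };  // Re(jamp)
  const double B[n] = { -0.5, 0.6, -0.7, 0.8 }; // Im(jamp)
  double full = 0, folded = 0;
  // Naive evaluation: AMA + BMB over the full matrix
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += ( A[i] * cf[i][j] * A[j] + B[i] * cf[i][j] * B[j] ) / denom[i];
  // Folded evaluation: upper triangle only, off-diagonal terms doubled
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] / denom[i] * A[i], ztI = cf[i][i] / denom[i] * B[i];
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * A[j];
      ztI += 2 * cf[i][j] / denom[i] * B[j];
    }
    folded += A[i] * ztR + B[i] * ztI;
  }
  assert( std::abs( full - folded ) < 1e-12 );
  return 0;
}
```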
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..186e3362ee --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
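The kernels in this new diagrams.h split the former monolithic calculate_wavefunctions into one __global__ kernel per Feynman diagram: diagram1 also computes the external wavefunctions, while the later diagrams reuse the internal wavefunctions left in the shared global-memory wfs buffer by their predecessors. This works because launches on the same CUDA stream execute in order. A self-contained toy illustration of that ordering guarantee (dummy stand-in kernels, not the generated ones):

```cpp
// Sketch only: per-diagram kernels chained on one stream; in-order stream
// semantics guarantee diagram2 sees the wavefunctions written by diagram1.
#include <cuda_runtime.h>

__global__ void diagram1( float* wfs, float* jamps ) { wfs[threadIdx.x] = 1.f; }
__global__ void diagram2( float* wfs, float* jamps ) { jamps[threadIdx.x] += wfs[threadIdx.x]; }

int main()
{
  float *wfs, *jamps;
  cudaMalloc( (void**)&wfs, 32 * sizeof( float ) );
  cudaMalloc( (void**)&jamps, 32 * sizeof( float ) );
  cudaMemset( jamps, 0, 32 * sizeof( float ) );
  cudaStream_t stream;
  cudaStreamCreate( &stream );
  diagram1<<<1, 32, 0, stream>>>( wfs, jamps ); // computes the wavefunctions
  diagram2<<<1, 32, 0, stream>>>( wfs, jamps ); // reuses them: same stream, ordered
  cudaStreamSynchronize( stream );
  cudaStreamDestroy( stream );
  cudaFree( wfs );
  cudaFree( jamps );
  return 0;
}
```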
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index c1c42990a2..50d05d273c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,328 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
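In the mixed-precision build (FPTYPE=m: double precision amplitudes, single precision color algebra), nParity=2 and the loop below processes two neppV-wide double event pages whose jamps are merged by fpvmerge into a single float vector of twice the width, so that one float color-sum pass covers both pages. The idea in plain scalar form (a sketch assuming neppV=4, not the plugin's SIMD types):

```cpp
// Sketch only: the idea behind fpvmerge in the mixed-precision (FPTYPE=m) path.
// Two SIMD pages of doubles are narrowed into one float vector of twice the
// width, so a single float color-sum pass covers two double event pages.
#include <array>

std::array<float, 8> fpvmergeSketch( const std::array<double, 4>& page0,
                                     const std::array<double, 4>& page1 )
{
  std::array<float, 8> out; // narrowing may underflow/lose precision, cf. issue #831
  for( int i = 0; i < 4; i++ ) out[i] = static_cast<float>( page0[i] );
  for( int i = 0; i < 4; i++ ) out[4 + i] = static_cast<float>( page1[i] );
  return out;
}
```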
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
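The in-kernel color algebra removed above is what color_sum_blas (in the new color_sum.cc earlier in this diff) re-expresses as BLAS calls on GPU: Step 1 is one GEMM per real/imaginary component, Step 2 a strided-batched GEMM of 1x1 dot products with beta=1 to accumulate into the running MEs. A plain-loop emulation of the two steps for one component, under the same "new1" jamp[icol*nevt+ievt] striding (sketch only, not the actual cuBLAS/hipBLAS calls):

```cpp
// Sketch only: what the gpuBlasTgemm / gpuBlasTgemmStridedBatched pair computes
// for one component (real or imaginary) of the jamps.
// jamp[icol*nevt + ievt]: input color amplitudes ("new1" striding)
// cnorm[i*ncolor + j]: normalized color matrix cf[i][j] / denom[i]
#include <vector>

void colorSumBlasEmulation( double* mes, const double* jamp, const double* cnorm,
                            int ncolor, int nevt )
{
  // Step 1 (GEMM): ztemp[i][ievt] = sum_j cnorm[i][j] * jamp[j][ievt]
  std::vector<double> ztemp( ncolor * nevt );
  for( int i = 0; i < ncolor; i++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += cnorm[i * ncolor + j] * jamp[j * nevt + ievt];
      ztemp[i * nevt + ievt] = sum;
    }
  // Step 2 (batched 1x1 GEMMs with beta=1): mes[ievt] += sum_i jamp[i][ievt] * ztemp[i][ievt]
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int i = 0; i < ncolor; i++ )
      mes[ievt] += jamp[i * nevt + ievt] * ztemp[i * nevt + ievt];
}
```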
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -605,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -782,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 running sum over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -972,13 +1056,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1026,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the per-helicity numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1214,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
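+// This file collects the color-sum implementations used by sigmaKin: a SIMD C++ version (color_sum_cpu), a plain GPU kernel (color_sum_kernel) and a cuBLAS/hipBLAS version (color_sum_blas), with color_sum_gpu dispatching between the two GPU paths at runtime. +// For one helicity, each implementation adds to the running |M|^2 the color sum deltaME = sum_{icol,jcol} ( Re(jamp[icol]) * M[icol][jcol] * Re(jamp[jcol]) + Im(jamp[icol]) * M[icol][jcol] * Im(jamp[jcol]) ), where M[icol][jcol] = colorMatrix[icol][jcol] / colorDenom[icol] is the normalized color matrix defined below.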
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..cf3655728d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
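+// This file defines one kernel per Feynman diagram of this process (diagram1 to diagram5), all sharing a uniform signature (wfs, jamps, channelIds, couplings or COUPs, numerators, denominators); diagram1 additionally takes the momenta and the helicity ihel, because it also computes the external-particle wavefunctions that the subsequent diagram kernels reuse through the shared wfs buffer.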
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 5 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 );
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 5 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 5 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ // Amplitude(s) for diagram number 3
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 4 OF 5 ***
+ // Wavefunction(s) for diagram number 4
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ // Amplitude(s) for diagram number 4
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 5 OF 5 ***
+ // Wavefunction(s) for diagram number 5
+ // (none)
+ // Amplitude(s) for diagram number 5
+ VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ 
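The commented-out alternatives in DeviceAccessJamp above document three candidate memory layouts for the jamp buffer ("old", "new1", "new2"). A small standalone C++ sketch (toy ncolor/nevt values, not the plugin's constants) makes the flat-index arithmetic of each layout explicit:

#include <cstdio>

int main()
{
  const int ncolor = 4, nevt = 8; // toy sizes, not the plugin's
  const int icol = 2, ievt = 5;
  // "old": ncolor blocks of 2*nevt, real then imag inside each color block
  const int oldRe = icol * 2 * nevt + ievt;
  const int oldIm = icol * 2 * nevt + nevt + ievt;
  // "new1": real plane then imag plane, each an ncolor x nevt matrix (ievt fastest)
  // - the cuBLAS-friendly layout: each plane is one contiguous matrix
  const int new1Re = 0 * ncolor * nevt + icol * nevt + ievt;
  const int new1Im = 1 * ncolor * nevt + icol * nevt + ievt;
  // "new2": real plane then imag plane, each an nevt x ncolor matrix (icol fastest)
  const int new2Re = 0 * nevt * ncolor + ievt * ncolor + icol;
  const int new2Im = 1 * nevt * ncolor + ievt * ncolor + icol;
  printf( "old=(%d,%d) new1=(%d,%d) new2=(%d,%d)\n", oldRe, oldIm, new1Re, new1Im, new2Re, new2Im );
  return 0;
}

All three layouts store the same ncolor*2*nevt values; "new1" keeps ievt fastest within each plane, which preserves coalesced per-thread access in the CUDA kernels while also exposing each plane as a single dense matrix to cuBLAS.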
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
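The color_sum_gpu declaration above sizes the scratch buffer as blasTmp[ncolor*2*nevt] in uniform precision and blasTmp[(2*ncolor*2+1)*nevt] in mixed precision. A short standalone sketch (toy sizes, hypothetical names) spells out where that arithmetic comes from, mirroring the pointer offsets used in the BLAS implementation:

#include <cstdio>

int main()
{
  // Toy sizes (illustrative, not the plugin's): nx2=2 for real+imag parts
  const int ncolor = 4, nevt = 16, nx2 = 2;
  // Uniform-precision mode: blasTmp only needs the Ztemp work area
  const int plainSize = ncolor * nx2 * nevt;
  // Mixed-precision mode: Ztemp + a float copy of the jamps + float MEs
  const int mixedSize = ( 2 * ncolor * nx2 + 1 ) * nevt;
  // Offsets mirroring the pointer arithmetic in the mixed-precision branch
  const int offZtemp = 0;                         // first ncolor*2*nevt values
  const int offJampsFpt2 = ncolor * nx2 * nevt;   // second ncolor*2*nevt values
  const int offMEsFpt2 = 2 * ncolor * nx2 * nevt; // last nevt values
  printf( "plain=%d mixed=%d offsets: ztemp=%d jamps=%d mes=%d\n",
          plainSize, mixedSize, offZtemp, offJampsFpt2, offMEsFpt2 );
  return 0;
}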
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 else ifeq ($(BACKEND),hip)
+ # example architecture values MI200:gfx90a, MI300X:gfx942
+ MADGRAPH_HIP_ARCHITECTURE ?= gfx942
 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
 GPUCC = $(HIP_HOME)/bin/hipcc
 XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
 ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 # AMD HIP architecture flags
- GPUARCHFLAGS = --offload-arch=gfx90a
+ GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
 GPUFLAGS += $(GPUARCHFLAGS)
 # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 #-------------------------------------------------------------------------------
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+ ifeq ($(GPUCC),) # CPU-only build
+ override HASBLAS = hasNoBlas
+ else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+ # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+ # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
- $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+ $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
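The HASBLAS makefile machinery above either links -lcublas/-lhipblas or defines -DMGONGPU_HAS_NO_BLAS. A minimal standalone C++ sketch (illustrative names only, not the plugin's types) of how such a macro typically gates a BLAS code path at compile time while keeping the plain-kernel fallback available at runtime:

#include <cstdio>

//#define MGONGPU_HAS_NO_BLAS 1 // emulate a HASBLAS=hasNoBlas build by uncommenting

#ifndef MGONGPU_HAS_NO_BLAS
struct BlasHandle {}; // stand-in for cublasHandle_t/hipblasHandle_t
static void colorSumBlas( BlasHandle* ) { printf( "BLAS color sum\n" ); }
#endif

static void colorSum( void* pBlasHandle )
{
#ifndef MGONGPU_HAS_NO_BLAS
  if( pBlasHandle != nullptr )
  {
    colorSumBlas( static_cast<BlasHandle*>( pBlasHandle ) );
    return;
  }
#else
  (void)pBlasHandle; // BLAS support compiled out: only the kernel path exists
#endif
  printf( "kernel color sum\n" ); // fallback path, always available
}

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  BlasHandle handle;
  colorSum( &handle ); // BLAS path
#endif
  colorSum( nullptr ); // kernel path
  return 0;
}

This mirrors the null-handle dispatch in color_sum_gpu earlier in the patch: a null handle selects the kernel reduction, a non-null handle selects the BLAS implementation.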
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
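The boilerplate above documents the GPU wavefunction buffer layout: nwf slabs of nevt*nw6*nx2 fptypes, with each event owning one contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 values inside its slab. A standalone C++ sketch (toy sizes, not the real nwf of this process) of the same carving arithmetic:

#include <cstdio>
#include <vector>

int main()
{
  const int nwf = 3, nw6 = 6, nx2 = 2, nevt = 4; // toy sizes, not the process's
  std::vector<double> wfs( nwf * nevt * nw6 * nx2 );
  // Carve the flat buffer into nwf slabs, one per wavefunction (stride nevt*nw6*nx2)
  double* w_fp[nwf];
  for( int i = 0; i < nwf; i++ ) w_fp[i] = wfs.data() + i * nevt * nw6 * nx2;
  // Within a slab, event ievt owns one contiguous run of nw6*nx2 = 12 values
  const int iwf = 1, ievt = 2;
  const double* fi = w_fp[iwf] + ievt * nw6 * nx2;
  printf( "flat offset of (iwf=%d,ievt=%d) = %ld\n", iwf, ievt, (long)( fi - wfs.data() ) );
  return 0;
}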
//----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
index a304fc85c8..616eab36fd 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -872,7 +872,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -885,7 +885,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -898,7 +898,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1P0_3( const fptype allF1[],
 const fptype allF2[],
@@ -911,7 +911,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 VVV1_0( const fptype allV1[],
 const fptype allV2[],
@@ -923,7 +923,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -936,7 +936,7 @@ namespace mg5amcCpu
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1
);
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -948,7 +948,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -961,7 +961,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
 const cxtype cI = cxmake( 0., 1. );
 F1[0] = +F2[0] + V3[0];
@@ -980,7 +980,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -993,7 +993,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
 const cxtype cI = cxmake( 0., 1. );
 F2[0] = +F1[0] + V3[0];
@@ -1012,7 +1012,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1P0_3( const fptype allF1[],
 const fptype allF2[],
@@ -1025,7 +1025,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 );
 const cxtype cI = cxmake( 0., 1.
);
 V3[0] = +F1[0] + F2[0];
@@ -1043,7 +1043,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 VVV1_0( const fptype allV1[],
 const fptype allV2[],
@@ -1056,7 +1056,7 @@ namespace mg5amcCpu
 const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 );
 const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) };
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
index 998cb505a0..1dfc54c553 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
index 1565ed5888..a4ace231d8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -300,7 +300,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
 // Compute the output couplings (e.g. gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
 __device__ inline void
 G2COUP( const fptype gs[],
 fptype couplings[],
@@ -310,10 +310,10 @@ namespace mg5amcCpu
 using namespace Parameters_sm_dependentCouplings;
 const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
 DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
- fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
- cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
- cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
+ fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+ fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+ cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+ cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
 GC_11s_sv = couplings_sv.GC_11;
 GC_10s_sv = couplings_sv.GC_10;
 mgDebug( 1, __FUNCTION__ );
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
+// Choose if cuBLAS and hipBLAS are supported for computing color sums
+// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index c46ef95a65..56d680a7fe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -128,16 +128,16 @@ Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -149,59 +149,59 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.076 s +Wrote files for 12 helas calls in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.266 s +ALOHA: aloha creates 4 routines in 0.257 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.250 s +ALOHA: aloha creates 8 routines in 0.240 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.141s -user 0m1.860s -sys 0m0.270s +real 0m2.155s +user 0m1.886s +sys 0m0.264s Code generation completed in 2 seconds ************************************************************ * * @@ -215,7 +215,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -223,9 +223,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -245,7 +245,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -253,9 +253,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 92581deeee..8b5e2d66c2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat index 8af20dc4e4..3802880982 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat index 0815703ee4..6917ce597f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
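The alfas_functions.f hunk above guards the two-loop Newton step: if 1D0+A_IN*B0(NF)*T is not positive, the LOG in the following statement would be undefined, so alpha_s is instead flagged with the 9d98 sentinel and the routine returns early. A minimal C++ sketch of the same control flow (the names aIn, t, b0, c1 stand in for A_IN, T, B0(NF), C1(NF); this is an illustration, not code from this diff):

#include <cmath>
// Guarded one-/two-loop Newton step for the running of alpha_s (sketch)
double newtonStep( double aIn, double t, double b0, double c1, int nloop )
{
  const double u = 1. + aIn * b0 * t;
  const double aOneLoop = aIn / u;  // one-loop running
  if( nloop == 1 ) return aOneLoop;
  if( u <= 0. ) return 9e98;        // sentinel: log( u ) below would be undefined
  return aIn / ( 1. + b0 * aIn * t + c1 * aIn * std::log( u ) ); // two-loop correction
}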
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
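The Bridge.h hunk above wraps the "Instantiate device/host Bridge" banners, the set_gpugrid warning and the abnormal-ME flagging in #ifdef MGONGPUCPP_VERBOSE, so production runs are quiet by default. A minimal sketch of the gating pattern, assuming the macro is supplied on the compile line (e.g. -DMGONGPUCPP_VERBOSE; the build-system hook itself is not part of this diff):

#include <iostream>
// Quiet by default; verbose only when MGONGPUCPP_VERBOSE is defined at build time (sketch)
void reportBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#else
  (void)nevt; // silence unused-parameter warnings in quiet builds
#endif
}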
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
+#include "mgOnGpuConfig.h"
+
 #include <cassert>

//--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

//--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

//--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
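The gpuBlasT* aliases at the end of GpuAbstraction.h pick the single- or double-precision BLAS symbol according to the MGONGPU_FPTYPE2_FLOAT build flag, on top of the CUDA/HIP mapping above. A sketch of a color-sum style matrix product routed through these aliases (the ncolor-by-nevt column-major layout and the buffer names are illustrative assumptions, not the plugin's actual color_sum_blas implementation, which only appears by name later in this diff):

#include "GpuAbstraction.h"
#ifndef MGONGPU_HAS_NO_BLAS
// tmp = cf * jamps, one GEMM over all events of one helicity (sketch)
void colorGemmSketch( gpuBlasHandle_t handle, int ncolor, int nevt,
                      const fptype2* devCf,    // ncolor x ncolor color matrix
                      const fptype2* devJamps, // ncolor x nevt jamp buffer
                      fptype2* devTmp )        // ncolor x nevt intermediate result
{
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  // Resolves to cublasSgemm/cublasDgemm (CUDA) or hipblasSgemm/hipblasDgemm (HIP)
  gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                ncolor, nevt, ncolor,
                &alpha, devCf, ncolor, devJamps, ncolor,
                &beta, devTmp, ncolor );
}
#endif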
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
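GpuRuntime.h above gains checkGpuBlas, the BLAS counterpart of checkGpu: it prints the failing status code with file and line, then asserts on anything other than GPUBLAS_STATUS_SUCCESS. A short usage sketch, mirroring how MatrixElementKernels.cc later in this diff creates one handle per good helicity (the handle and stream names are illustrative):

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
void blasHandleSketch()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // the macro already wraps checkGpu
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // gpuBlasCreate is not wrapped, so check it here
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // bind the handle to its own stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
#endif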
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h index e98a172df1..4e154ca3bc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
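The MemoryAccessAmplitudes.h hunk above removes the never-used AOSOA indexing machinery (the MGONGPU_TRIVIAL_AMPLITUDES branch was always taken) and keeps only the trivial access path, a reinterpretation of the flat fptype buffer as complex SIMD values in place. A minimal sketch of what remains (the onDevice template parameter follows the plugin's convention and is an assumption; cxtype_sv and fptype are the plugin's types):

// Trivial amplitude access: no per-event record decoding step is needed (sketch)
template<bool onDevice>
class KernelAccessAmplitudesSketch
{
public:
  static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
  {
    return reinterpret_cast<cxtype_sv*>( buffer ); // same memory, complex view
  }
  static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer )
  {
    return reinterpret_cast<const cxtype_sv*>( buffer );
  }
};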
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h index 90075da66e..110b93643f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
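MemoryAccessWavefunctions.h above splits the old kernel-templated access in two: on the host the trivial in-place cast survives, while on the device each thread now offsets into its own slice of the per-helicity wavefunction super-buffer before casting. A sketch of the device-side indexing, assuming the buffer holds nevt consecutive slices of CPPProcess::nw6 * mgOnGpu::nx2 fptype values per event:

// One GPU thread owns one event and its contiguous wavefunction slice (sketch)
static __device__ inline cxtype_sv* deviceWfAccessSketch( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // global thread index = event index
  return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
}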
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
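As a reading aid for the buffer classes above: each typedef pins an element type and a per-event size onto a RAII wrapper, so device memory is released when the buffer leaves scope. A much-simplified sketch of that pattern (CUDA; SimpleDeviceBuffer is a hypothetical name, not the plugin's class hierarchy):

```cpp
// Simplified RAII sketch under stated assumptions; not the plugin classes.
#include <cstddef>
#include <cuda_runtime.h>

template<typename T, std::size_t sizePerEvent>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( std::size_t nevt )
    : m_size( sizePerEvent * nevt ), m_data( nullptr )
  {
    cudaMalloc( (void**)&m_data, m_size * sizeof( T ) ); // acquire on construction
  }
  ~SimpleDeviceBuffer() { cudaFree( m_data ); } // release when leaving scope
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};

// Like the typedefs above, a name fixes element type and per-event size
typedef SimpleDeviceBuffer<double, 1> SimpleDeviceBufferNumerators;
```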
- // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index 5d6a4e1f06..1f301c5523 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 
*** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,311 +279,141 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)!
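The comment block above is the heart of the kernel-splitting change: in C++ the wavefunctions can stay in a small stack array that lives only for one SIMD event page, whereas the split per-diagram GPU kernels must share them through a global-memory buffer. A minimal sketch of that storage choice (illustrative only: double in place of fptype, and a runtime flag in place of the MGONGPUCPP_GPUIMPL compile-time switch):

```cpp
// Minimal sketch under stated assumptions; not the generated code.
constexpr int nwf = 5; // #wavefunctions for this process (external + internal)
constexpr int nw6 = 6; // components per wavefunction
constexpr int nx2 = 2; // real and imaginary parts

// C++ keeps TRIVIAL access to a stack array that lives only for one event page;
// the GPU build must use the global allWfs buffer instead, because the split
// per-diagram kernels launched later all need to see the same wavefunctions.
inline double* selectWfsBuffer( double ( &w_local )[nwf][nw6 * nx2], double* allWfs, bool onGpu )
{
  return onGpu ? allWfs : &w_local[0][0];
}
```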
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif
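The BUG #823 / FIX #823 bookkeeping that follows is easier to see in isolation: the coupling pointer table has ndcoup event-dependent entries followed by the independent ones, and the fix sizes the tail by nIPC instead of nicoup. A standalone sketch (illustrative sizes, and plain pointer arrays in place of the CD_ACCESS/CI_ACCESS accessor classes):

```cpp
// Standalone sketch under stated assumptions: ndcoup=2 running couplings
// (depend on alphas event-by-event), nIPC=1 coupling fixed for all events.
#include <cstddef>
constexpr std::size_t ndcoup = 2, nIPC = 1;
constexpr std::size_t nxcoup = ndcoup + nIPC; // FIX #823: size the tail by nIPC, not nicoup

void buildCoupTable( const double* dependent[ndcoup],  // vary event-by-event
                     const double* independent[nIPC],  // fixed for all events
                     const double* COUPs[nxcoup] )
{
  for( std::size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = dependent[idcoup]; // dependent couplings first
  for( std::size_t iicoup = 0; iicoup < nIPC; iicoup++ )
    COUPs[ndcoup + iicoup] = independent[iicoup]; // independent couplings appended
}
```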
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 4 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[2] -= 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 4 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 4 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] =
CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
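One detail of the JAMPS hunk above that is easy to misread: in C++ the output pointer is steered by iParity, so in mixed-precision mode the two event pages land in the first and second ncolor-sized halves of the caller's jamp_sv array. A sketch of just that pointer arithmetic (illustrative: a double[2] pair stands in for one complex cxtype_sv entry, ncolor=3 as in this process):

```cpp
// Minimal sketch under stated assumptions; not the generated code.
constexpr int ncolor = 3;
typedef double cx[2]; // one complex value: { re, im }

inline double* jampsForParity( cx* jamp_sv, int iParity ) // caller array cx[nParity * ncolor]
{
  // iParity==0 writes into jamp_sv[0..ncolor-1], iParity==1 into jamp_sv[ncolor..2*ncolor-1]
  return reinterpret_cast<double*>( iParity == 0 ? jamp_sv : &jamp_sv[ncolor] );
}
```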
- jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 4 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) - - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif #else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
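For reference, the color algebra that the deleted block above implements (and that now moves to the color_sum kernels) is the quadratic form deltaME = sum_ij conj(jamp_i) * cf[i][j]/denom[i] * jamp_j, where the normalized matrix cf[i][j]/denom[i] is real and symmetric, so only the diagonal plus the upper triangle with a factor 2 is needed (#475). A self-contained numerical sketch for this process's ncolor=3 matrix (std::complex<double> in place of the SIMD vector types):

```cpp
// Self-contained color-sum sketch using the cf/denom arrays from the hunk above.
#include <complex>
#include <cstdio>
constexpr int ncolor = 3;
static constexpr double denom[ncolor] = { 3, 3, 1 };
static constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] * jamp[i].real();
    double ztI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] * jamp[j].real(); // factor 2: symmetric off-diagonal terms
      ztI += 2 * cf[i][j] * jamp[j].imag();
    }
    deltaME += ( jamp[i].real() * ztR + jamp[i].imag() * ztI ) / denom[i];
  }
  return deltaME;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 1, 0 }, { 0, 1 }, { 1, 1 } };
  std::printf( "deltaME = %f\n", colorSum( jamp ) );
  return 0;
}
```

With jamp = (1, i, 1+i) this prints deltaME = 30.666667, matching the full double sum over i and j, which illustrates why the triangular form halves the work without changing the result.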
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 4 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -569,7 +452,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -602,6 +489,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -643,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( 
Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,26 +640,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -772,25 +667,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) [...] assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) [...] +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + const int nevt ) // input: #events + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output:
allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -928,20 +1027,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -953,17 +1046,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -989,93 +1085,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
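The scheduling pattern introduced above (one CUDA/HIP stream per good helicity, slicing each helicity's work out of the ghel* super-buffers, then a device-wide synchronize before helicity and color selection) can be reduced to a few lines. A toy CUDA sketch with a stand-in kernel (workForHelicity is hypothetical; the real code launches the generated diagram and color-sum kernels on each stream):

```cpp
// Toy stream-per-helicity sketch under stated assumptions; not the plugin code.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void workForHelicity( float* buf, int ighel ) { buf[threadIdx.x] += ighel; }

int main()
{
  constexpr int nGoodHel = 4, nthreads = 32;
  cudaStream_t streams[nGoodHel];
  float* superBuf = nullptr; // one slice per good helicity, as for ghelAllJamps/ghelAllWfs
  cudaMalloc( (void**)&superBuf, nGoodHel * nthreads * sizeof( float ) );
  cudaMemset( superBuf, 0, nGoodHel * nthreads * sizeof( float ) );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  // Independent helicities proceed concurrently, one per stream, each on its own slice
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    workForHelicity<<<1, nthreads, 0, streams[ighel]>>>( superBuf + ighel * nthreads, ighel );
  cudaDeviceSynchronize(); // as in sigmaKin: wait before helicity/color selection
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( superBuf );
  std::printf( "done\n" );
  return 0;
}
```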
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (deferred until after the helicity loop to avoid breaking parallelism across streams) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1117,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1140,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1149,25 +1215,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1177,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1194,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1300,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 30c5663297..c519f81e85 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 4; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 0b39d55964..252d00c684 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index c57e06d578..3fef361dd9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..6b493df318 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -0,0 +1,384 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary terms cancel because M is symmetric, i.e. AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
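[Editorial aside, not part of the patch] The comment block above compresses the key algebra of color_sum_cpu: for jamps J = A + iB and a real symmetric (normalized) color matrix M, the color sum Re(J^H M J) equals A^T M A + B^T M B, and the symmetric double loop can be folded onto the upper triangle with doubled off-diagonal entries. A minimal standalone C++ sketch of both identities, using the normalized 3x3 matrix of this process (colorMatrix rows divided by colorDenom); names and values below are illustrative only:

```cpp
// Standalone check of the identities used by color_sum_cpu (illustrative sketch only)
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncol = 3;
  // Normalized color matrix (real, symmetric): colorMatrix[i][j] / colorDenom[i]
  const double M[ncol][ncol] = { { 16. / 3, -2. / 3, 2 }, { -2. / 3, 16. / 3, 2 }, { 2, 2, 6 } };
  const std::complex<double> J[ncol] = { { 1.1, -0.3 }, { -0.7, 2.2 }, { 0.5, 0.9 } }; // arbitrary jamps
  // Reference: full complex quadratic form conj(J)^T M J
  std::complex<double> ref = 0;
  for( int i = 0; i < ncol; i++ )
    for( int j = 0; j < ncol; j++ )
      ref += std::conj( J[i] ) * M[i][j] * J[j];
  // Folded form: upper triangle only, off-diagonal terms doubled, real/imag parts kept separate
  double folded = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = M[i][i] * J[i].real();
    double ztI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < ncol; j++ )
    {
      ztR += 2 * M[i][j] * J[j].real();
      ztI += 2 * M[i][j] * J[j].imag();
    }
    folded += J[i].real() * ztR + J[i].imag() * ztI; // the A^T M A + B^T M B contribution
  }
  assert( std::abs( ref.imag() ) < 1e-12 );          // imaginary part cancels for symmetric M
  assert( std::abs( ref.real() - folded ) < 1e-12 ); // folded loop reproduces the full sum
  return 0;
}
```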
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! However, the same striding as in compute_jamps and cuBLAS is used here, just in case this is better for performance + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth =
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h new file mode 100644 index 0000000000..24cc14c39e --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h @@ -0,0 +1,138 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
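[Editorial aside, not part of the patch] color_sum_blas above evaluates ME[ievt] += J(ievt)^T M J(ievt) in two BLAS steps per component (real and imaginary): first one GEMM Z = M * J^T giving an ncolor x nevt matrix, then nevt batched 1x1 GEMMs (dot products J(ievt,:) . Z(:,ievt)) with beta=1 so that the imaginary plane accumulates on top of the real one. A plain C++ sketch of the same decomposition without any BLAS dependency; the dimensions, layout and values are hypothetical:

```cpp
// Two-step color sum (GEMM + batched dot products) spelled out as plain loops
#include <cassert>
#include <cmath>
#include <vector>

int main()
{
  constexpr int ncolor = 3, nevt = 4; // hypothetical small dimensions
  const double M[ncolor][ncolor] = { { 16. / 3, -2. / 3, 2 }, { -2. / 3, 16. / 3, 2 }, { 2, 2, 6 } };
  // Jamps in the "new1" striding: one ncolor*nevt plane per component, index [icol * nevt + ievt]
  std::vector<double> jR( ncolor * nevt ), jI( ncolor * nevt );
  for( int k = 0; k < ncolor * nevt; k++ ) { jR[k] = 0.1 * k - 0.5; jI[k] = 0.07 * k + 0.2; }
  std::vector<double> MEs( nevt, 0 );
  for( const auto* J : { &jR, &jI } ) // real plane first, then imaginary plane
  {
    // Step 1 (the gpuBlasTgemm call): Z[icol][ievt] = sum_jcol M[icol][jcol] * J[jcol][ievt]
    std::vector<double> Z( ncolor * nevt, 0 );
    for( int i = 0; i < ncolor; i++ )
      for( int e = 0; e < nevt; e++ )
        for( int j = 0; j < ncolor; j++ )
          Z[i * nevt + e] += M[i][j] * ( *J )[j * nevt + e];
    // Step 2 (the gpuBlasTgemmStridedBatched call): one dot product per event; beta=1 accumulates
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncolor; i++ )
        MEs[e] += ( *J )[i * nevt + e] * Z[i * nevt + e];
  }
  // Cross-check the first event against the direct quadratic form
  double ref = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      ref += M[i][j] * ( jR[i * nevt] * jR[j * nevt] + jI[i * nevt] * jI[j * nevt] );
  assert( std::abs( MEs[0] - ref ) < 1e-12 );
  return 0;
}
```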
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 4 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 4 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 4 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 4 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 598338d03e..2a2fccda40 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,26 +394,26 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 3) /5.333333333333333D+00, - $ -6.666666666666666D-01,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 3) /16,-4,12/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 3) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 4, 5) /16,12/ C 1 T(2,1,3,4) - DATA (CF(I, 3),I= 1, 3) /2.000000000000000D+00 - $ ,2.000000000000000D+00,6.000000000000000D+00/ + DATA (CF(I),I= 6, 6) /18/ C 1 T(3,4) Tr(1,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(MDL_MB - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WH.NE.0D0) FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH - $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + FK_ZERO = 0D0 + IF(MDL_WH.NE.0D0) THEN + FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH + $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + ELSE + FK_MDL_WH = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -455,10 +456,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +470,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. 
+ iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + 
kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
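[Editorial aside, not part of the patch] The "old"/"new1"/"new2" stridings commented in DeviceAccessJamp above differ only in index arithmetic; "new1" keeps all real parts of one color contiguous across events, which is what lets color_sum_blas view the buffer as two ncolor x nevt matrices without copies. A small host-side sketch of the three layouts (the ncolor/nevt values are hypothetical):

```cpp
// Index arithmetic for the three jamp buffer layouts discussed in DeviceAccessJamp
#include <cstdio>

int main()
{
  constexpr int ncolor = 3, nevt = 4;
  // "old"  : one 2 x nevt block per color          -> [icol][reim][ievt]
  auto oldIdx = []( int reim, int icol, int ievt ) { return icol * 2 * nevt + reim * nevt + ievt; };
  // "new1" : one ncolor x nevt plane per component -> [reim][icol][ievt] (cuBLAS-friendly)
  auto new1Idx = []( int reim, int icol, int ievt ) { return reim * ncolor * nevt + icol * nevt + ievt; };
  // "new2" : one nevt x ncolor plane per component -> [reim][ievt][icol]
  auto new2Idx = []( int reim, int icol, int ievt ) { return reim * nevt * ncolor + ievt * ncolor + icol; };
  std::printf( "jamp(icol=1,ievt=2).imag -> old=%d new1=%d new2=%d\n",
               oldIdx( 1, 1, 2 ), new1Idx( 1, 1, 2 ), new2Idx( 1, 1, 2 ) );
  return 0;
}
```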
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
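As a reading aid for the HASBLAS logic configured above: the default is derived from the GPU compiler and from the presence of the vendor BLAS header, and can always be overridden on the command line (e.g. "make HASBLAS=hasNoBlas", the example quoted in the makefile comment). The following Python paraphrase of the decision tree is only illustrative (the function and argument names are invented here; the makefile remains the authoritative logic):

    import os

    def default_hasblas(gpucc, cuda_home='', hip_home=''):
        # CPU-only build: no GPU BLAS at all
        if not gpucc:
            return 'hasNoBlas'
        # Nvidia build: enable cuBLAS only if its header is installed
        if 'nvcc' in gpucc:
            header = os.path.join(cuda_home, 'include', 'cublas_v2.h')
        # AMD build: enable hipBLAS only if its header is installed
        elif 'hipcc' in gpucc:
            header = os.path.join(hip_home, 'include', 'hipblas', 'hipblas.h')
        else:
            return 'hasNoBlas'
        return 'hasBlas' if os.path.isfile(header) else 'hasNoBlas'

    # e.g. default_hasblas('/usr/local/cuda/bin/nvcc', cuda_home='/usr/local/cuda')

With hasBlas, BLASLIBFLAGS then adds -lcublas (CUDA) or -lhipblas (HIP) to the link lines below; with hasNoBlas, the code is compiled with -DMGONGPU_HAS_NO_BLAS.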
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of the physical stot and dsqrt_shatmax**2 integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out. + from the template will not be written out. + > use_mg5amc_py8_interface is a flag indicating whether the MG5aMC-PY8 interface is used; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py 
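The extended_cmd.py and file_writers.py hunks above share one pattern: contextual variables are no longer exec()-ed into the local scope (an idiom that Python 3 does not reliably reflect in locals() inside a function); instead they are kept in an explicit dict that is passed to eval() as its namespace. A minimal sketch with invented variable names:

    # explicit namespace for the template condition
    context = {'helas_calls': 12, 'madloop': False}

    # old, fragile style: exec("helas_calls = 12") then eval("helas_calls > 4")
    # new style: hand the context dict to eval() directly
    result = eval('helas_calls > 4 and not madloop', globals(), context)
    print(result)  # True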
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

-    if (-e "matrix$imatrix$max_jpg.jpg" ) {
-	print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to jpeg.";
+    if (-e "matrix$imatrix$max_png.png" ) {
+	print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG.";
 	print PAGE "<BR><BR> 
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
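The pseudorapidity fix above swaps the sign in the log argument: the old (norm - pz)/(norm + pz) ordering returned -eta, so a forward-going particle (pz > 0) came out backward. A minimal standalone sketch of the corrected formula, cross-checked against the equivalent -ln(tan(theta/2)) definition (the momentum components below are arbitrary illustrative values, not from the patch):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), i.e. -ln(tan(theta/2))
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-going momentum (pz > 0) must now give eta > 0
px, py, pz = 1.0, 2.0, 3.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
assert pseudorapidity(px, py, pz) > 0
assert abs(pseudorapidity(px, py, pz) + math.log(math.tan(theta / 2))) < 1e-12
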
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
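For reference, a standalone sketch of the conditional '_decayed_*' infix that do_multi_run now splices into the merge.pl glob when MadSpin-decayed runs are detected (the directory and run names below are hypothetical placeholders):

from os.path import join as pjoin

events_dir, run_name = '/tmp/Events', 'run_01'  # hypothetical placeholders
for madspin in (False, True):
    pattern = pjoin(events_dir,
                    '%s_*%s' % (run_name, '_decayed_*' if madspin else ''),
                    'unweighted_events.lhe.gz')
    print(pattern)
# -> /tmp/Events/run_01_*/unweighted_events.lhe.gz
# -> /tmp/Events/run_01_*_decayed_*/unweighted_events.lhe.gz
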
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
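The bookkeeping in remove_empty_events above amounts to scanning each channel log backwards (via misc.BackRead) for known marker strings and grouping the G directories by reason. A stripped-down sketch of that classification step, with made-up marker lines and channel names:

import collections

MARKERS = {  # marker substring -> reason key, as in the scan above
    'Impossible BW configuration': 'bwconfig',
    'Loosen cuts or increase max_events': 'cuts',
    'all returned zero': 'zero',
}

def classify(log_lines):
    # log_lines is assumed newest-first, as misc.BackRead would yield them
    for line in log_lines:
        for marker, reason in MARKERS.items():
            if marker in line:
                return reason
    return 'unknown'

reasons = collections.defaultdict(list)
fake_logs = {'P1/G1': ['... all returned zero ...'],
             'P1/G2.1': ['... Impossible BW configuration ...']}
for gdir, lines in fake_logs.items():
    reasons[classify(lines)].append(gdir)
assert reasons == {'zero': ['P1/G1'], 'bwconfig': ['P1/G2.1']}
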
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! 
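With split_channels now enabled for the gridpack refine, the number of subjobs per channel follows the ceiling-division rule from get_job_for_event above, with maxevts feeding max_request_event through the options dict. A distilled sketch of just that rule (the real code also forces a single job when split_channels is off and re-splits on low nevents):

def n_jobs(needed_event, max_request_event, max_splitting):
    # ceiling division, clamped to [1, max_splitting]
    nb_split = max(1, (needed_event - 1) // max_request_event + 1)
    return min(nb_split, max_splitting)

assert n_jobs(2500, 2500, 100) == 1     # fits in a single job
assert n_jobs(2501, 2500, 100) == 2     # one event over -> second job
assert n_jobs(10**9, 2500, 100) == 100  # clamped at max_splitting
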
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h index 1b04401547..9e5b1e3584 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ INLINE void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -886,7 +886,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ INLINE void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -970,7 +970,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -983,7 +983,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1008,7 +1008,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1021,7 +1021,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP8 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1033,7 +1033,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1046,7 +1046,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1065,7 +1065,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1078,7 +1078,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1097,7 +1097,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -1110,7 +1110,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * F2[2] + F1[3] * F2[3] + F1[4] * F2[4] + F1[5] * F2[5] ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc index 0fa5a34cf0..a9b14b3a06 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h index 0faa7bb71e..45c7bd04c2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -335,7 +335,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -345,12 +345,12 @@ namespace mg5amcCpu using namespace Parameters_heft_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_13s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_13s_sv = C_ACCESS::kernelAccess( GC_13s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_13s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_13s_sv = CD_ACCESS::kernelAccess( GC_13s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_13s_sv = couplings_sv.GC_13; GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! 
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 04039fcd14..bfa5f8322b 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,15 +49,20 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. 
Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.00581049919128418  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -127,45 +132,45 @@ INFO: Process has 4 diagrams Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.261 s +ALOHA: aloha creates 4 routines in 0.259 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
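The operator+= and operator-= overloads added to cxtype_ref at the start of this section implement compound assignment through a reference proxy that points at separately stored real and imaginary parts. A minimal standalone sketch of that pattern, with simplified hypothetical names (cxref_sketch) and std::complex standing in for the plugin's cxtype:

#include <cassert>
#include <complex>

using fptype = double;
using cxtype = std::complex<fptype>; // stand-in for the plugin's cxtype

class cxref_sketch // hypothetical name, not the plugin's class
{
public:
  cxref_sketch( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  // Compound assignment updates the referenced storage in place,
  // mirroring the operator+=/-= added to cxtype_ref in this diff
  cxref_sketch& operator+=( const cxtype& c )
  {
    *m_preal += c.real();
    *m_pimag += c.imag();
    return *this;
  }
  cxref_sketch& operator-=( const cxtype& c )
  {
    *m_preal -= c.real();
    *m_pimag -= c.imag();
    return *this;
  }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re = 1., im = 2.;
  cxref_sketch ref( re, im );
  ref += cxtype( 3., 4. ); // accumulate through the proxy
  assert( re == 4. && im == 6. );
  return 0;
}

The proxy makes it possible to accumulate complex values directly into buffers whose real and imaginary parts live at separate addresses.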
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.646s -user 0m0.583s +real 0m0.671s +user 0m0.604s sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
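The GpuAbstraction.h and GpuRuntime.h hunks above build a single gpu*/gpuBlas* macro vocabulary that resolves to CUDA or HIP at compile time, with assert-style error checking. A reduced, self-contained sketch of the same pattern (only a handful of macros; the scale kernel and buffer sizes are hypothetical, not the plugin's code):

#include <cstdio>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#endif

// Assert-style error check, following the checkGpu/assertGpu pattern above
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess ) printf( "ERROR! assertGpu: '%s' in %s:%d\n", gpuGetErrorString( code ), file, line );
}

__global__ void scale( double* data, double factor ) // hypothetical kernel
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  data[ievt] *= factor;
}

int main()
{
  double* d = nullptr;
  gpuMalloc( &d, 256 * sizeof( double ) ); // the same source compiles under nvcc or hipcc
  gpuLaunchKernel( scale, 2, 128, d, 0.5 );
  checkGpu( gpuDeviceSynchronize() );
  gpuFree( d );
  return 0;
}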
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel
cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h index e98a172df1..4e154ca3bc 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
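For reference, the AOSOA[npagA][nx2][neppA] layout encoded by the deleted MemoryAccessAmplitudesBase reduces to a single flattening formula; a small illustration of the index arithmetic (neppA=4 is an arbitrary choice for this sketch, while the deleted class used neppA=1, i.e. plain AOS):

#include <cassert>

constexpr int nx2 = 2;   // real and imaginary components
constexpr int neppA = 4; // events per page (illustrative value)

// Flatten AOSOA[ipagA][ix2][ieppA] for event ievt and component ix2,
// exactly as the deleted ieventAccessRecord/decodeRecord pair did
inline int aosoaIndex( int ievt, int ix2 )
{
  const int ipagA = ievt / neppA; // which page
  const int ieppA = ievt % neppA; // event slot within the page
  return ipagA * nx2 * neppA + ix2 * neppA + ieppA;
}

int main()
{
  // Event 5 is page 1, slot 1; its imaginary part (ix2=1) sits after
  // the page's 4 real components: 1*2*4 + 1*4 + 1 = 13
  assert( aosoaIndex( 5, 1 ) == 13 );
  // With neppA=1 the layout degenerates to AOS: index = ievt*nx2 + ix2
  return 0;
}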
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h index 90075da66e..110b93643f 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
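The new DeviceAccessWavefunctions above replaces that machinery with a fixed per-event stride: each device thread derives its event index from its grid coordinates and offsets into a flat buffer. A sketch of the access pattern (thrust::complex stands in for the plugin's cxtype_sv; nw6=6 and nx2=2 as in mgOnGpu; the zeroFirstComponent kernel is hypothetical):

#include <thrust/complex.h>

using fptype = double;
using cxtype = thrust::complex<fptype>;
constexpr int nw6 = 6; // components per wavefunction
constexpr int nx2 = 2; // real+imag

// Each GPU thread handles one event: its wavefunction record starts at a
// fixed stride from the buffer base, so no AOSOA page arithmetic is needed
__device__ inline cxtype* wfAccess( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return reinterpret_cast<cxtype*>( buffer + ievt * nw6 * nx2 );
}

__global__ void zeroFirstComponent( fptype* buffer )
{
  cxtype* w = wfAccess( buffer );
  w[0] = cxtype( 0, 0 ); // touch only this event's own record
}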
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index b9f394434a..1f301c5523 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 
*** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,307 +279,141 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
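The removed NB comments here, and the WAVEFUNCTION BUFFERS block that follows, are the heart of this change: once each Feynman diagram runs in its own GPU kernel, intermediate wavefunctions can no longer live in per-thread local arrays and must be staged in device-global memory (allWfs). A minimal sketch of that constraint, with hypothetical kernel and buffer names:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void diagramA( double* wfs ) // fills one intermediate "wavefunction" per event
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  wfs[ievt] = 2. * ievt; // stands in for vxxxxx/oxxxxx/... writing into allWfs
}

__global__ void diagramB( const double* wfs, double* jamps ) // consumes it in a later launch
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = wfs[ievt] + 1.; // stands in for an amplitude update of the jamps
}

int main()
{
  const int nevt = 8;
  double *wfs = nullptr, *jamps = nullptr;
  cudaMalloc( &wfs, nevt * sizeof( double ) );
  cudaMalloc( &jamps, nevt * sizeof( double ) );
  diagramA<<<1, nevt>>>( wfs );        // kernel #1: thread-local variables would not survive this launch
  diagramB<<<1, nevt>>>( wfs, jamps ); // kernel #2: reads what kernel #1 wrote to global memory
  cudaDeviceSynchronize();
  double out[8];
  cudaMemcpy( out, jamps, sizeof( out ), cudaMemcpyDeviceToHost );
  printf( "jamps[3] = %f\n", out[3] ); // prints 7.0
  cudaFree( wfs );
  cudaFree( jamps );
  return 0;
}

In the C++ branch nothing changes conceptually: the diagrams remain plain function calls within one stack frame, so w_sv can stay a local array.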
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 4 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = 
CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 4 *** - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 4 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 4 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) - - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
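The removed code around this point is the in-kernel color sum that the patch relocates behind color_sum.h (color_sum_gpu/color_sum_cpu). Its key identity is spelled out in the comments above: the normalized color matrix M[i][j] = cf[i][j]/denom[i] is real (and symmetric), so the quadratic form (A-iB)M(A+iB) over the jamps J = A+iB collapses to AMA + BMB, with no cross terms. A small numerical sketch using the denom and cf values quoted in this hunk (the jamp inputs are made up):

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 3;
  constexpr double denom[ncolor] = { 3, 3, 1 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.4 }, { 0.5, -0.6 } };
  double deltaME = 0; // this helicity's contribution to the running |M|^2
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of M*A and M*B (before the 1/denom)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol]; // AMA + BMB
  }
  printf( "deltaME = %f\n", deltaME );
  return 0;
}

The C++ branch above additionally folds the factor 2 and the 1/denom[icol] into the constexpr triangular matrix cf2 so that only the upper triangle is visited; the sketch keeps the straightforward full loop, which is what the CUDA branch computes.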
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 4 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -565,7 +452,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -598,6 +489,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -639,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( 
Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -741,26 +640,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -768,25 +667,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: 
allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -931,13 +1034,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -949,17 +1046,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -985,93 +1085,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for 
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1113,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1136,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1145,25 +1215,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1173,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1190,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1296,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 30c5663297..c519f81e85 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 4; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..6b493df318 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -0,0 +1,384 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
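+//
+// Overview (summary of the implementations below): this file computes the color sum
+//   |M|^2 += sum_ij jamp_i^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j
+// for one helicity, in three variants: color_sum_cpu (C++/SIMD, using a constexpr triangular
+// normalized color matrix), color_sum_kernel (a plain GPU kernel, using the device copy
+// s_pNormalizedColorMatrix2 of the normalized color matrix) and color_sum_blas (two
+// cuBLAS/hipBLAS GEMM steps); color_sum_gpu dispatches between the last two at runtime,
+// depending on whether a BLAS handle was created (HASBLAS=hasBlas builds with CUDACPP_RUNTIME_BLASCOLORSUM set).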
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h new file mode 100644 index 0000000000..ea1cf69605 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h @@ -0,0 +1,134 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 4 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] );
+    // Amplitude(s) for diagram number 1
+    FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 4 ***
+    // Wavefunction(s) for diagram number 2
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 4 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 4 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
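+//
+// Jamp buffer layout (the "new1" striding, see DeviceAccessJamp below): on the GPU, for one
+// helicity, allJamps[ncolor*2*nevt] stores all real parts first and then all imaginary parts,
+// each as an ncolor x nevt matrix with ievt as the fastest-running index:
+//   Re( jamp[icol][ievt] ) = allJamps[0 * ncolor * nevt + icol * nevt + ievt]
+//   Im( jamp[icol][ievt] ) = allJamps[1 * ncolor * nevt + icol * nevt + ievt]
+// e.g. for ncolor=3 and nevt=4, Im( jamp[1][2] ) is at allJamps[12 + 4 + 2] = allJamps[18].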
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 );           // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
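+  // Illustrative usage of color_sum_cpu (a sketch of the helicity loop in sigmaKin, see CPPProcess.cc):
+  //   cxtype_sv jamp_sv[nParity * ncolor] = {};           // jamps for one good helicity ihel
+  //   calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
+  //   color_sum_cpu( allMEs, jamp_sv, ievt00 );           // ADDS |M|^2 for this helicity to the running sum
+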
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
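+//
+// This header is #included at the top of each diagramXXX function in diagrams.h. For both the
+// GPU and C++ builds it defines the accessor typedefs (W_ACCESS, A_ACCESS, CD_ACCESS, CI_ACCESS,
+// J_ACCESS and, with multichannel support, NUM_ACCESS/DEN_ACCESS), the wavefunction pointers
+// w_fp[nwf], the coupling pointers COUPs (GPU only; C++ receives COUPs as an argument), the local
+// amplitude buffer amp_sv/amp_fp and, with multichannel support, the scalar channelId and the
+// numerators_sv/denominators_sv references used for single-diagram-enhancement weights.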
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h index 1b04401547..9e5b1e3584 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ INLINE void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -886,7 +886,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ INLINE void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -970,7 +970,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -983,7 +983,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1008,7 +1008,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1021,7 +1021,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP8 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1033,7 +1033,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1046,7 +1046,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -1065,7 +1065,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1078,7 +1078,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1097,7 +1097,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -1110,7 +1110,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * F2[2] + F1[3] * F2[3] + F1[4] * F2[4] + F1[5] * F2[5] ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc index 0fa5a34cf0..a9b14b3a06 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h index 0faa7bb71e..45c7bd04c2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. 
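
The recurring change in the HelAmps_heft.h hunks above is a rename of the coupling access-policy template parameter from C_ACCESS to CD_ACCESS, documenting that these vertex kernels consume dependent (event-varying) couplings. The underlying pattern is compile-time policy injection: one kernel body serves both host and device because the access class passed as a template argument hides the memory layout. A minimal sketch under simplified types (ToyHostAccessCouplings and toyVertex are illustrative, not the generated code):

#include <complex>
using fptype = double;
using cxtype_sv = std::complex<fptype>; // scalar stand-in for the real SIMD/GPU vector type

// Toy policy: a dependent coupling stored as an [RI] pair for the current event
struct ToyHostAccessCouplings
{
  static cxtype_sv kernelAccessConst( const fptype* buffer )
  {
    return cxtype_sv( buffer[0], buffer[1] );
  }
};

// A vertex kernel parameterized on the access policy, as in the generated HelAmps code
template<class CD_ACCESS>
void toyVertex( const fptype allCOUP[], cxtype_sv& out )
{
  const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); // the policy decides the layout
  out = COUP; // real kernels combine COUP with the input wavefunctions here
}

// Usage: toyVertex<ToyHostAccessCouplings>( buf, result );
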
// Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -335,7 +335,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -345,12 +345,12 @@ namespace mg5amcCpu using namespace Parameters_heft_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_13s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_13s_sv = C_ACCESS::kernelAccess( GC_13s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_13s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_13s_sv = CD_ACCESS::kernelAccess( GC_13s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_13s_sv = couplings_sv.GC_13; GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 11380fe474..4faaccb09b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006017446517944336  +DEBUG: model prefixing takes 0.005433559417724609  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.107 s +4 processes with 8 diagrams generated in 0.110 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -223,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.640 s +12 processes with 144 diagrams generated in 0.647 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes 
into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -271,9 +271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -282,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -293,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -304,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -315,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -326,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -337,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -348,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.202 s -Wrote files for 212 helas calls in 0.830 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.200 s +Wrote files for 212 helas calls in 0.918 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.204 s +ALOHA: aloha creates 3 routines in 0.198 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.200 s +ALOHA: aloha creates 6 routines in 0.195 s FFV1 FFV1 FFV1 @@ -370,73 +370,73 @@ ALOHA: aloha creates 6 routines in 0.200 s FFV2 FFV2 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
+INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.658s -user 0m4.105s -sys 0m0.537s +real 0m4.835s +user 0m4.190s +sys 0m0.553s Code generation completed in 5 seconds ************************************************************ * * @@ -450,7 +450,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -458,9 +458,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -480,7 +480,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -488,9 +488,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 72b31976a0..9d9e01b7c5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat index 5eca3e3f2b..48beb899d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat index 3b445d02a0..c22a9e0249 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
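
The Bridge.h hunks above wrap the "WARNING! Instantiate ... Bridge" printouts and the flagAbnormalMEs calls in a new MGONGPUCPP_VERBOSE guard, so this per-instantiation chatter is compiled out of default builds. A minimal sketch of the idiom (toyInstantiate is an illustrative function, not plugin code):

#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to restore the diagnostic printout;
// in default builds the message (and its formatting cost) is absent from the binary.
void toyInstantiate( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... kernel setup continues unconditionally ...
}
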
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
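The gpuBlas* aliases above let one color-sum implementation compile against cuBLAS under nvcc and hipBLAS under hipcc, with the gpuBlasT* names resolving to the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A minimal usage sketch under stated assumptions: d_A, d_B, d_C are device arrays allocated elsewhere, fptype2 is the plugin's color-sum floating-point type, and checkGpuBlas is the checker from the GpuRuntime.h hunk that follows.

#ifndef MGONGPU_HAS_NO_BLAS
void gemmSketch( const fptype2* d_A, const fptype2* d_B, fptype2* d_C, int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // cublasCreate or hipblasCreate, depending on the build
  const fptype2 alpha = 1, beta = 0;
  // C = A * B^T: resolves to cublas[SD]gemm or hipblas[SD]gemm via gpuBlasTgemm
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_T, n, n, n, &alpha, d_A, n, d_B, n, &beta, d_C, n ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif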
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
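Since setUp and tearDown above now default to debug=false, the one-line initialization traces disappear from standard runs; callers can still opt back in explicitly. A usage sketch, assuming the enclosing GpuRuntime helper struct from the hunk above:

void runWithTraces() // hypothetical caller for illustration only
{
  mg5amcGpu::GpuRuntime::setUp( /*debug=*/true );    // re-enables the setup trace
  // ... launch kernels here ...
  mg5amcGpu::GpuRuntime::tearDown( /*debug=*/true ); // prints "__GpuRuntime: calling GpuDeviceReset()"
}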
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h index 7b12b981ec..8a49e698cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
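With MGONGPU_TRIVIAL_AMPLITUDES gone, amplitude access reduces to the reinterpret-cast accessors kept above: a one-event amplitude buffer is read directly as complex SIMD values, with no AOSOA index arithmetic. A self-contained sketch of the surviving pattern (the Sketch suffix marks the class as illustrative; fptype and cxtype_sv are the plugin's scalar and complex SIMD types):

template<bool onDevice>
class KernelAccessAmplitudesSketch
{
public:
  static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
  {
    return reinterpret_cast<cxtype_sv*>( buffer ); // one-event buffer: no event indexing needed
  }
  static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer )
  {
    return reinterpret_cast<const cxtype_sv*>( buffer );
  }
};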
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: -
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 1e7cc050f7..129dd0150a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
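The new DeviceAccessWavefunctions above replaces the AOSOA helper machinery with a flat per-event stride: each GPU thread finds its wavefunction record at an offset of nw6*nx2 fptypes times its event index. A host-side sketch of the same arithmetic, assuming nw6=6 components per wavefunction and nx2=2 real/imaginary parts (the values behind CPPProcess::nw6 and mgOnGpu::nx2):

inline const double* eventWavefunctionRecord( const double* buffer, int ievt )
{
  constexpr int nw6 = 6; // components per wavefunction (see CPPProcess::nw6)
  constexpr int nx2 = 2; // real and imaginary parts (see mgOnGpu::nx2)
  return buffer + ievt * nw6 * nx2; // start of the record for event ievt
}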
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_no_b_mass_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 97050f0aa2..6099d099a9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,288 +280,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
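For illustration, a minimal standalone C++ sketch (hypothetical names, not part of the generated code) of the "non-trivial access" pattern used by accessor classes such as DeviceAccessJamp2 above: a quantity indexed by color and event is stored as a structure-of-arrays buffer[icol * nevt + ievt], so that the event index runs fastest and consecutive GPU threads (consecutive ievt) touch consecutive memory addresses, while "trivial access" is a plain local buffer for one event only.

#include <cassert>

// Hypothetical host-side emulation of the kernelAccessIcol indexing above, for illustration only
inline double& accessIcolIevt( double* buffer, int icol, int ievt, int nevt )
{
  return buffer[icol * nevt + ievt]; // same formula as DeviceAccessJamp2::kernelAccessIcol, with nevt passed explicitly
}

int main()
{
  const int nevt = 4;
  double jamp2s[2 * nevt] = {}; // ncolor=2 colors times nevt=4 events, color-major
  accessIcolIevt( jamp2s, 1, 3, nevt ) = 42.; // write the jamp2 of color 1 for event 3
  assert( jamp2s[1 * nevt + 3] == 42. );
  return 0;
}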
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // --------------- + // --- 
MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_dux_ttxwm()?) 
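For illustration, a minimal scalar C++ sketch (hypothetical helper, not the generated SIMD/CUDA code) of the color-flow bookkeeping visible in the removed lines above: each Feynman diagram yields one invariant amplitude, which is folded into the ncolor=2 leading-color partial amplitudes jamp[icol] with process-specific rational coefficients (here 1/6 and -1/2 for both diagrams of this process).

#include <complex>

using cxtype = std::complex<double>; // scalar stand-in for the cxtype_sv SIMD vector type

// Hypothetical helper: fold one diagram's invariant amplitude into the two color flows
void addDiagramToJamps( cxtype jamp[2], const cxtype& amp )
{
  jamp[0] += 1. / 6. * amp; // coefficient of this diagram in color flow 0
  jamp[1] -= 1. / 2. * amp; // coefficient of this diagram in color flow 1
}

int main()
{
  cxtype jamp[2] = {}; // reset color flows, then accumulate both diagrams
  addDiagramToJamps( jamp, cxtype( 1., 0. ) ); // amplitude of diagram 1 (dummy value)
  addDiagramToJamps( jamp, cxtype( 0., 1. ) ); // amplitude of diagram 2 (dummy value)
  return 0;
}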
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -579,7 +481,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +519,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +565,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -757,26 +671,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = 
MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +698,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
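+ // (A plain 'kernelAccessIcol( colAllJamp2s, icol ) += ...' would be a non-atomic read-modify-write:
+ // with one good helicity per stream, several instances of this kernel may run concurrently and
+ // accumulate into the same colAllJamp2s super-buffer, so the per-color running sum over helicities
+ // below must use an atomic update to avoid losing contributions from other streams)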
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -938,22 +1056,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,17 +1077,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1001,93 +1116,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1214,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1237,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,25 +1246,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1189,8 +1280,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1299,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1406,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 9d6c262053..3837ab2e64 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 48; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 7f7324dc0b..bcd37d9641 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 08dd1f728a..27f2a0aae3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + --------------------------------------------------------------------------
 + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix.
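[As a standalone cross-check of the identity in the comment above, the following minimal host-only program is an editorial sketch (arbitrary jamp test values, double precision, no SIMD; it is not part of the generated color_sum.cc). It verifies that the triangular form with doubled off-diagonal terms reproduces the full quadratic form.]

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  constexpr double colorDenom[ncolor] = { 1, 1 };
  constexpr double colorMatrix[ncolor][ncolor] = { { 9, 3 }, { 3, 9 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 } }; // arbitrary test values
  // Full quadratic form: Re[ sum_ij conj(jamp_i) * M_ij * jamp_j / denom_i ]
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += ( std::conj( jamp[i] ) * colorMatrix[i][j] * jamp[j] ).real() / colorDenom[i];
  // Triangular form: diagonal terms once, off-diagonal terms doubled (M is real and symmetric)
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = colorMatrix[i][i] / colorDenom[i] * jamp[i].real();
    double ztempI = colorMatrix[i][i] / colorDenom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
      ztempI += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( meFull - meTri ) < 1e-12 ); // AMA + BMB: the imaginary cross terms cancel
  return 0;
}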
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h new file mode 100644 index 0000000000..bb3b936ca7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h @@ -0,0 +1,83 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
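[One remark on the two BLAS calls in color_sum_blas above before the per-diagram kernels: step 1 is a GEMM computing Ztemp = NormColMat * Jamps (real and imaginary parts separately), step 2 is a batched 1x1 GEMM with beta=1 adding the per-event dot product Jamp(:,ievt) . Ztemp(:,ievt) to the running MEs. The host-side reference below is an editorial sketch under the "new1" striding jamps[ix2*ncolor*nevt + icol*nevt + ievt]; colorSumReference is a hypothetical name, double precision only, and the normalized color matrix is assumed symmetric here so its row/column-major layout does not matter.]

#include <vector>

void colorSumReference( double* allMEs,           // in/out: allMEs[nevt], running sum over helicities
                        const double* allJamps,   // input: jamps[2*ncolor*nevt], "new1" striding (real block, then imag block)
                        const double* normColMat, // input: normalized color matrix [ncolor*ncolor]
                        int ncolor,
                        int nevt )
{
  std::vector<double> ztemp( 2 * ncolor * nevt );
  for( int ix2 = 0; ix2 < 2; ix2++ ) // 0 = real parts, 1 = imaginary parts
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // Step 1 (GEMM): Ztemp(icol,ievt) = sum_jcol NormColMat(icol,jcol) * Jamp(jcol,ievt)
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double sum = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          sum += normColMat[icol * ncolor + jcol] * allJamps[ix2 * ncolor * nevt + jcol * nevt + ievt];
        ztemp[ix2 * ncolor * nevt + icol * nevt + ievt] = sum;
      }
      // Step 2 (batched 1x1 GEMM with beta=1): ME(ievt) += Jamp(:,ievt) dot Ztemp(:,ievt)
      double dot = 0;
      for( int icol = 0; icol < ncolor; icol++ )
        dot += allJamps[ix2 * ncolor * nevt + icol * nevt + ievt] * ztemp[ix2 * ncolor * nevt + icol * nevt + ievt];
      allMEs[ievt] += dot;
    }
}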
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 2 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 2 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f index 531dfa0771..8963914a5c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 5c47e1c729..f8745a68c0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +399,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +445,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -497,10 +506,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +520,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 57246ba1e7..3a07e52836 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,288 +280,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // --------------- + // --- 
MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_udx_ttxwp()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
- } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -579,7 +481,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +519,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +565,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -757,26 +671,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +698,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 summed over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //--------------------------------------------------------------------------
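(A self-contained sketch of the cumulative-sum random selection implemented by add_and_select_hel above, and reused for color selection in select_col below; names are illustrative. The first candidate whose running sum, taken as a fraction of the total, exceeds the random number r in [0,1) is selected:)

// Illustrative only: returns the 0-based index selected among n weights w[0..n-1]
static int sketch_selectByCumulative( const double r, const double* w, const int n )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( r < running / total ) return i;
  }
  return n - 1; // guard against rounding when r is very close to 1
}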
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //--------------------------------------------------------------------------
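(A self-contained sketch of the per-color accumulation performed by the update_jamp2s kernel above, written for plain double arrays for one event and one helicity; cxabs2(z) is Re(z)^2 + Im(z)^2:)

// Illustrative only: jamp2 keeps the running sum of |jamp|^2 over helicities for each color
static void sketch_update_jamp2s( const double* jampRe, const double* jampIm, double* jamp2, const int ncolor )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += jampRe[icol] * jampRe[icol] + jampIm[icol] * jampIm[icol]; // += cxabs2( jamp[icol] )
}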
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -938,22 +1056,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,17 +1077,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1001,93 +1116,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of the multichannel numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } }
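(A sketch of the per-helicity slicing of the ghelAll* super-buffers computed in the loop above, assuming mgOnGpu::nx2 == 2 counts the real and imaginary parts; the helper name is illustrative:)

// Illustrative only: good helicity ighel owns a contiguous [ncolor * nx2 * nevt] slice of the jamp super-buffer
static double* sketch_jampSliceForHelicity( double* ghelAllJamps, const int ighel, const int nevt, const int ncolor )
{
  const int nx2 = 2; // number of components (real and imaginary parts)
  return ghelAllJamps + ighel * nevt * ncolor * nx2; // same offset arithmetic as hAllJamps above
}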
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1214,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1237,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,25 +1246,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1189,8 +1280,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1299,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1406,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index cd8edd3e39..7ffb85326e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 48; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 2e439af0a3..b5e5d182dd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 0808ce67ce..6abc3b39fd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template <typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. 
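+ // (Worked illustration for ncolor=2: with the real symmetric M = { { 9, 3 }, { 3, 9 } } and jamp J = A + iB, + // conj(J)^T M J = A^T M A + B^T M B + i * ( A^T M B - B^T M A ), where the imaginary cross term vanishes + // because A^T M B == B^T M A for a symmetric M, so only the Re/Re and Im/Im products need to be summed.)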
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gemmStridedBatched (cuBLAS/hipBLAS) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h new file mode 100644 index 0000000000..bb3b936ca7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h @@ -0,0 +1,83 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
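As a reference for the algebra that color_sum_kernel and color_sum_blas above both implement: per event and per helicity, the two GEMM calls compute Ztemp = M*J for the real and imaginary parts separately, and the two strided-batched GEMMs then take the per-event dot products J.(M*J), exploiting the fact that for a real symmetric color matrix M the quadratic form (A-iB)ᵀM(A+iB) reduces to AᵀMA + BᵀMB. A minimal single-event sketch of the same computation (illustrative only, hypothetical helper name, not part of this patch):

```cpp
// Illustrative sketch: the color sum for ONE event, matching the loop structure
// of color_sum_kernel above. 'cm' is the normalized color matrix, i.e.
// cm[icol*ncolor+jcol] = cf[icol][jcol]/denom[icol]; jampR/jampI hold the real
// and imaginary parts of the ncolor partial amplitudes for one helicity.
double colorSumOneEvent( int ncolor, const double* cm, const double* jampR, const double* jampI )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cm[icol * ncolor + jcol] * jampR[jcol]; // real part of (M*J)[icol]
      ztempI += cm[icol * ncolor + jcol] * jampI[jcol]; // imag part of (M*J)[icol]
    }
    deltaME += ztempR * jampR[icol] + ztempI * jampI[icol]; // J.(M*J), real by symmetry of M
  }
  return deltaME; // |M|^2 contribution of this helicity for this event
}
```

The BLAS path simply batches the inner matrix-vector product over all events in one GEMM, and the final dot products over all events in one strided-batched GEMM.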
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h
new file mode 100644
index 0000000000..bb3b936ca7
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h
@@ -0,0 +1,83 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 2 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 2 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
index 531dfa0771..8963914a5c 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
index bbf708250a..beb5bf4d5c 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
C     By the MadGraph5_aMC@NLO Development Team
C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
C     By the MadGraph5_aMC@NLO Development Team
C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -399,7 +399,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(1)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
C     Needed for v4 models
@@ -444,23 +445,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
C     COLOR DATA
C
-      DATA (CF(I,  1),I=  1,  2) /9.000000000000000D+00
-     $ ,3.000000000000000D+00/
+      DATA DENOM/1/
+      DATA (CF(I),I=  1,  2) /9,6/
C     1 T(2,1) T(3,4)
-      DATA (CF(I,  2),I=  1,  2) /3.000000000000000D+00
-     $ ,9.000000000000000D+00/
+      DATA (CF(I),I=  3,  3) /9/
C     1 T(2,4) T(3,1)
C     ----------
C     BEGIN CODE
C     ----------
      IF (FIRST) THEN
        FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
-        IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WW)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
+        IF(MDL_WW.NE.0D0) THEN
+          FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WW)
+        ELSE
+          FK_MDL_WW = 0D0
+        ENDIF
+
        IF(INIT_MODE) THEN
          ZEROAMP_1(:,:) = .TRUE.
@@ -497,10 +506,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
      MATRIX1 = 0.D0
      DO M = 1, NAMPSO
+        CF_INDEX = 0
        DO I = 1, NCOLOR
          ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
          ENDDO
          DO N = 1, NAMPSO
@@ -509,6 +520,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
          ENDDO
        ENDDO
      ENDDO
+      MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
index 3261780672..6c8a1c5d5b 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm_no_b_mass.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -97,20 +99,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_no_b_mass_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 4;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -169,57 +167,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,   // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,             // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,           // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,      // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,    // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,      // input: cuda stream for this helicity
+                   const int gpublocks,        // input: cuda gpublocks
+                   const int gputhreads )      // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,         // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,      // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,    // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )          // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta;        // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions;  // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes;     // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings;     // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = DeviceAccessNumerators;   // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
    using M_ACCESS = HostAccessMomenta;          // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements;   // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions;    // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes;       // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = HostAccessCouplings;       // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = HostAccessCouplingsFixed;  // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
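A note on the memory layout behind these accessor classes: device buffers in this patch are structures-of-arrays, so that consecutive GPU threads (consecutive events) touch consecutive addresses. A minimal sketch of the "new1" jamp striding used by DeviceAccessJamp2, convertD2F_Jamps and the cuBLAS calls above (illustrative only, hypothetical helper, not part of the patch):

```cpp
// Illustrative sketch: index into a jamp buffer holding ncolor complex partial
// amplitudes per event over nevt events, split into real/imaginary planes:
//   jamp(icol,ievt).real -> buffer[0 * ncolor * nevt + icol * nevt + ievt]
//   jamp(icol,ievt).imag -> buffer[1 * ncolor * nevt + icol * nevt + ievt]
// Consecutive ievt (i.e. consecutive threads) map to consecutive addresses,
// which gives coalesced loads/stores on the GPU.
inline int jampIndex( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt; // ix2 = 0 (real) or 1 (imag)
}
```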
@@ -227,436 +280,157 @@ namespace mg5amcCpu
    using DEN_ACCESS = HostAccessDenominators;   // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
    // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
    // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
    cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
    // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
    // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
    for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
      const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
      const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-      // *** DIAGRAM 1 OF 12 ***
-
-      // Wavefunction(s) for diagram number 1
-      ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
-
-      oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
-
-      vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
-      FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] );
-      FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
-      FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 12 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 3 OF 12 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
-      FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
-      FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 4 OF 12 ***
-
-      // Wavefunction(s) for diagram number 4
-      // (none)
-
-      // Amplitude(s) for diagram number 4
-      FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 5 OF 12 ***
-
-      // Wavefunction(s) for diagram number 5
-      FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
-      FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
-
-      // Amplitude(s) for diagram number 5
-      FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 6 OF 12 ***
-
-      // Wavefunction(s) for diagram number 6
-      // (none)
-
-      // Amplitude(s) for diagram number 6
-      FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 7 OF 12 ***
-
-      // Wavefunction(s) for diagram number 7
-      FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] );
-
-      // Amplitude(s) for diagram number 7
-      FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 8 OF 12 ***
-
-      // Wavefunction(s) for diagram number 8
-      FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 8
-      FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 9 OF 12 ***
-
-      // Wavefunction(s) for diagram number 9
-      FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 9
-      FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 10 OF 12 ***
-
-      // Wavefunction(s) for diagram number 10
-      VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
-      // Amplitude(s) for diagram number 10
-      FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 11 OF 12 ***
-
-      // Wavefunction(s) for diagram number 11
-      // (none)
-
-      // Amplitude(s) for diagram number 11
-      FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 12 OF 12 ***
-
-      // Wavefunction(s) for diagram number 12
-      FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-      // Amplitude(s) for diagram number 12
-      FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_dux_ttxwmg()?)
-
-      // The color denominators (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
-
-      // The color matrix (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 cf[ncolor][ncolor] = {
-        { 12, 4, 4, 0 },
-        { 4, 12, 0, 4 },
-        { 4, 0, 12, 4 },
-        { 0, 4, 4, 12 } }; // 2-D array[4][4]
-
-#ifndef MGONGPUCPP_GPUIMPL
-      // Pre-compute a constexpr triangular color matrix properly normalized #475
-      struct TriangularNormalizedColorMatrix
-      {
-        // See https://stackoverflow.com/a/34465458
-        __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-          : value()
-        {
-          for( int icol = 0; icol < ncolor; icol++ )
-          {
-            // Diagonal terms
-            value[icol][icol] = cf[icol][icol] / denom[icol];
-            // Off-diagonal terms
-            for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-              value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-          }
-        }
-        fptype2 value[ncolor][ncolor];
-      };
-      static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-      {
-        // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp_sv_previous[icol] = jamp_sv[icol];
-        MEs_previous = MEs;
-        continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-      }
-      fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-      // Sum and square the color flows to get the matrix element
-      // (compute |M|^2 by squaring |M|, taking into account colours)
-      fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-      // Use the property that M is a real matrix (see #475):
-      // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-      // In addition, on C++ use the property that M is symmetric (see #475),
-      // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-      // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-      // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv jampR_sv[ncolor] = { 0 };
-      fptype2_sv jampI_sv[ncolor] = { 0 };
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-        jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-      }
-#endif
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-        // === C++ START ===
-        // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRi_sv = jampR_sv[icol];
-        fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-        fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-        fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-        fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-        fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-        // Off-diagonal terms
-        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-        {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          fptype2_sv& jampRj_sv = jampR_sv[jcol];
-          fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-          fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-          fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-          ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-        }
-        fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        deltaMEs_previous += fpvsplit0( deltaMEs2 );
-        deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-        deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-        // === C++ END ===
 #else
-        // === CUDA START ===
-        fptype2_sv ztempR_sv = { 0 };
-        fptype2_sv ztempI_sv = { 0 };
-        for( int jcol = 0; jcol < ncolor; jcol++ )
-        {
-          fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-          fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-          ztempR_sv += cf[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf[icol][jcol] * jampIj_sv;
-        }
-        deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-        // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-      }
-      // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
 
-      // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
-      MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
-      MEs_sv_previous += deltaMEs_previous;
-#endif
-      /*
+      // *** DIAGRAMS 1 TO 12 ***
 #ifdef MGONGPUCPP_GPUIMPL
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+      gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+      gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
 #else
-#ifdef MGONGPU_CPPSIMD
-      if( cNGoodHel > 0 )
-        for( int ieppV = 0; ieppV < neppV; ieppV++ )
-          printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
-#else
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif
+      diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+      diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
 #endif
-      */
-    } // END LOOP ON IPARITY
-    mgDebug( 1, __FUNCTION__ );
+    }
+    // *****************************
+    // ***  END LOOP ON IPARITY  ***
+    // *****************************
+    return;
   }
@@ -775,7 +549,11 @@ namespace mg5amcCpu
 #else
    memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
-    fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+    // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+    fpeEnable();
+#endif
  }
 
  //--------------------------------------------------------------------------
@@ -810,6 +588,10 @@ namespace mg5amcCpu
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->mdl_MW );
    m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
    // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
    // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
    const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT };
@@ -853,6 +635,10 @@ namespace mg5amcCpu
    m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT );
    m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW );
    m_masses.push_back( Parameters_sm_no_b_mass::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
  }
 #endif
@@ -955,26 +741,26 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using G_ACCESS = DeviceAccessGs;
-    using C_ACCESS = DeviceAccessCouplings;
-    G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+    using CD_ACCESS = DeviceAccessCouplings;
+    G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
 #else
    using namespace mg5amcCpu;
    using G_ACCESS = HostAccessGs;
-    using C_ACCESS = HostAccessCouplings;
+    using CD_ACCESS = HostAccessCouplings;
    for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
      const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
      fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
    }
 #endif
  }
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -982,25 +768,40 @@ namespace mg5amcCpu
                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,   // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,  // output: isGoodHel[ncomb] - host array
+                       const int nevt )  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
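Background for the update_jamp2s kernel that follows: with one CUDA/HIP stream per good helicity, several instances of the kernel may run concurrently, each adding |jamp(icol)|^2 for its own helicity into the same colAllJamp2s[icol][ievt] slot. A plain `+=` would be a cross-stream read-modify-write race, hence the atomicAdd. A self-contained sketch of this accumulation pattern (illustrative only, hypothetical kernel name, not part of the patch):

```cpp
#include <cuda_runtime.h>

// Illustrative sketch: per-helicity-stream accumulation of |jamp|^2 into a
// per-color buffer shared by all helicity streams. One instance of this kernel
// is launched per good helicity, each on its own stream; atomicAdd makes the
// concurrent += into jamp2[icol*nevt+ievt] race-free across streams.
__global__ void accumulateJamp2( float* jamp2,        // in/out: jamp2[ncolor*nevt], summed over helicities
                                 const float* jampRI, // input: jamps for THIS helicity, "new1" striding
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const float jR = jampRI[0 * ncolor * nevt + icol * nevt + ievt]; // real part
    const float jI = jampRI[1 * ncolor * nevt + icol * nevt + ievt]; // imaginary part
    atomicAdd( &jamp2[icol * nevt + ievt], jR * jR + jI * jI );      // |jamp|^2, race-free
  }
}
```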
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV )
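The gpuMemset calls in this initialisation step zero the per-helicity "super-buffers" before the running sums over helicities are accumulated: each super-buffer concatenates one slice per good helicity, and the slice for helicity index ighel starts at ighel times the per-helicity stride (nevt for MEs/numerators/denominators, ncolor*2*nevt for jamps, where the factor 2 is mgOnGpu::nx2 for real and imaginary parts). A sketch of the sizing arithmetic implied by these calls, with double standing in for fptype (illustrative only):

    constexpr int nx2 = 2; // real and imaginary parts (mgOnGpu::nx2)
    inline size_t jampsSuperBufferBytes( int nGoodHel, int ncolor, int nevt )
    {
      return (size_t)nGoodHel * ncolor * nx2 * nevt * sizeof( double ); // cf. the ghelAllJamps gpuMemset
    }
    inline double* jampsSlice( double* ghelAllJamps, int ighel, int ncolor, int nevt )
    {
      return ghelAllJamps + (size_t)ighel * nevt * ncolor * nx2; // one helicity's jamps, cf. hAllJamps below
    }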
@@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index ecb184f729..4dd1c0e001 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb
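The hardcoded ncomb = 96 above is consistent with the six external legs of this d u~ > t t~ w- g subprocess: 2 helicities for each of the four fermions, 3 polarizations for the massive W boson and 2 for the gluon. A one-line compile-time check of that counting (illustrative only, not part of the generated code):

    static_assert( 2 * 2 * 2 * 2 * 3 * 2 == 96, "ncomb = 2^4 fermion helicities x 3 W polarisations x 2 gluon polarisations" );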
@@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index 26d6979a1d..8840068613 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index 330b566ed8..110863be58 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //--------------------------------------------------------------------------
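All of the color-sum implementations below evaluate the same real quadratic form: for the jamp vector J of one helicity, the contribution to |M|^2 is Re(J)^T Mn Re(J) + Im(J)^T Mn Im(J), where Mn is colorMatrix normalized by colorDenom; the imaginary cross terms cancel because the color matrix is real (see #475). A naive per-event reference for ncolor=4, useful as a cross-check of the optimized kernels, is sketched below (hypothetical helper, not part of the plugin):

    // Naive reference color sum for one event (illustrative cross-check only)
    #include <complex>
    double colorSumRef( const std::complex<double> jamp[4], const double colorMatrix[4][4], const double colorDenom[4] )
    {
      double me = 0;
      for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
          me += ( jamp[i].real() * colorMatrix[i][j] * jamp[j].real()
                  + jamp[i].imag() * colorMatrix[i][j] * jamp[j].imag() ) / colorDenom[i];
      return me;
    }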
+ +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h new file mode 100644 index 0000000000..81fbede8ee --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
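A note on the BLAS color sum in color_sum.cc above, before the generated per-diagram kernels: it factorizes the same quadratic form into two GEMM steps, first Ztemp[ncolor][nevt] = NormColMat * Jamps^T for the real and imaginary components separately, then nevt batched 1x1 GEMMs that take the per-event dot product and accumulate into the MEs (beta=1). A loop-level sketch of what the two calls compute for one component (shapes and strides only, not the BLAS API; names hypothetical):

    // What the gpuBlasTgemm + gpuBlasTgemmStridedBatched pair computes, written as plain loops
    void colorSumGemmRef( double* mes,              // in/out: mes[nevt], accumulated with beta = 1
                          const double* jamps,      // input: jamps[icol * nevt + ievt] ("new1" striding)
                          const double* normColMat, // input: normalized color matrix [ncolor * ncolor]
                          double* ztemp,            // tmp: ztemp[ievt * ncolor + icol]
                          int ncolor, int nevt )
    {
      for( int ievt = 0; ievt < nevt; ievt++ ) // Step 1: Ztemp = NormColMat * Jamps^T (one GEMM)
        for( int icol = 0; icol < ncolor; icol++ )
        {
          double z = 0;
          for( int jcol = 0; jcol < ncolor; jcol++ ) z += normColMat[icol * ncolor + jcol] * jamps[jcol * nevt + ievt];
          ztemp[ievt * ncolor + icol] = z;
        }
      for( int ievt = 0; ievt < nevt; ievt++ ) // Step 2: per-event dot product (batched 1x1 GEMMs)
        for( int icol = 0; icol < ncolor; icol++ )
          mes[ievt] += jamps[icol * nevt + ievt] * ztemp[ievt * ncolor + icol];
    }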
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
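Each amplitude is folded into exactly two of the ncolor = 4 leading-color flows with fixed rational coefficients (the -1/2 and +1/6 above; +-i/2 for the gluon-exchange diagrams 10 and 11 further down), and on the GPU the jamps buffer is structure-of-arrays with the color index as the slow dimension, as in the DeviceAccessJamp2 accessor added elsewhere in this diff. A stripped-down sketch of that accumulation, assuming a real-only buffer for brevity (the real layout interleaves real and imaginary planes):

```cuda
#include <cstdio>

// SoA accessor sketch: color index is the slow dimension, event the fast one,
// mirroring kernelAccessIcol's buffer[icol * nevt + ievt] in this diff
inline double& accessIcol( double* buffer, int icol, int ievt, int nevt )
{
  return buffer[icol * nevt + ievt];
}

int main()
{
  const int ncolor = 4, nevt = 8;
  double jamps[ncolor * nevt] = {};
  double amp = 0.3; // toy amplitude of one diagram for event 5
  // one diagram updates two color flows with fixed rational coefficients
  accessIcol( jamps, 2, 5, nevt ) -= 1. / 2. * amp;
  accessIcol( jamps, 3, 5, nevt ) += 1. / 6. * amp;
  printf( "jamp[2]=%f jamp[3]=%f\n", accessIcol( jamps, 2, 5, nevt ), accessIcol( jamps, 3, 5, nevt ) );
  return 0;
}
```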
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
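The wfs argument is documented as wavefunctions[nwf*2*nw6*nevtORneppV]: all nwf wavefunction slots for all events, each with nw6 complex components, promoted to a shared buffer because the split kernels must hand intermediates to each other (note also how slots such as w_fp[10] are recycled between diagrams 8 and 9 above). The flat-index sketch below assumes a hypothetical [iwf][ripart][iw6][ievt] ordering; the real ordering is owned by the plugin's memory accessor classes, which are not shown in this diff.

```cuda
#include <cstdio>

// Hypothetical flat index into wavefunctions[nwf][2][nw6][nevt]:
// iwf = wavefunction slot, ripart = 0 (real) / 1 (imag), iw6 = spinor or
// vector component, ievt = event. Only the total size matches the comment in
// the diff; the actual component ordering lives in the plugin's accessors.
inline int wfIndex( int iwf, int ripart, int iw6, int ievt, int nw6, int nevt )
{
  return ( ( iwf * 2 + ripart ) * nw6 + iw6 ) * nevt + ievt;
}

int main()
{
  const int nwf = 11, nw6 = 6, nevt = 4;
  static double wfs[nwf * 2 * nw6 * nevt] = {};
  // diagram 8 writes slot 10; diagram 9 later overwrites the same slot
  wfs[wfIndex( 10, 0, 0, 2, nw6, nevt )] = 1.5;
  printf( "wf slot 10, re, comp 0, event 2 -> flat %d, value %f\n",
          wfIndex( 10, 0, 0, 2, nw6, nevt ), wfs[wfIndex( 10, 0, 0, 2, nw6, nevt )] );
  return 0;
}
```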
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 4b8ccfcacb..1d1624ea30 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index c933a8f276..64b881af2b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
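In the C++ branch above, jamp_sv holds two pages of ncolor color flows in mixed-precision mode, one per iParity value, which is why the cast selects either jamp_sv or &jamp_sv[ncolor]. A toy sketch of that two-page layout, using plain doubles instead of cxtype_sv:

```cuda
#include <cstdio>

int main()
{
  const int ncolor = 4;
  // toy stand-in for cxtype_sv jamp_sv[2*ncolor]: one page of color flows per
  // parity (iParity = 0, 1), as selected by the cast in the diff above
  double jamp_sv[2 * ncolor] = {};
  for( int iParity = 0; iParity < 2; iParity++ )
  {
    double* jamps = ( iParity == 0 ? jamp_sv : &jamp_sv[ncolor] ); // page base
    jamps[2] += 0.5 * ( iParity + 1 ); // update color flow 2 of this page
  }
  printf( "page0 jamp[2]=%f page1 jamp[2]=%f\n", jamp_sv[2], jamp_sv[ncolor + 2] );
  return 0;
}
```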
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; }
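In the CUDA branch above, the old monolithic device function becomes twelve small kernels launched back-to-back on one stream; stream ordering guarantees that diagram2 sees the wavefunctions written by diagram1, while different helicities can overlap on different streams (see the ghelStreams argument of sigmaKin further down). A minimal standalone demonstration of that ordering guarantee, using raw CUDA runtime calls rather than the plugin's gpuLaunchKernelStream wrapper:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void stage1( float* buf ) { buf[threadIdx.x] = 1.0f; }
__global__ void stage2( float* buf ) { buf[threadIdx.x] += 2.0f; } // sees stage1's writes

int main()
{
  float* buf;
  cudaMalloc( &buf, 32 * sizeof( float ) );
  cudaStream_t stream;
  cudaStreamCreate( &stream );
  // same stream => stage2 starts only after stage1 completes (like diagram1..12)
  stage1<<<1, 32, 0, stream>>>( buf );
  stage2<<<1, 32, 0, stream>>>( buf );
  cudaStreamSynchronize( stream );
  float host[32];
  cudaMemcpy( host, buf, sizeof( host ), cudaMemcpyDeviceToHost );
  printf( "buf[0]=%f\n", host[0] ); // 3.0
  cudaStreamDestroy( stream );
  cudaFree( buf );
  return 0;
}
```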
@@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //--------------------------------------------------------------------------
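The helicity choice above is an inverse-CDF draw: the per-helicity |M|^2 values are converted in place into a running sum (reusing ghelAllMEs as scratch), and the first good helicity whose cumulative fraction exceeds allrndhel[ievt] is selected; select_col below applies the same rule to the cumulative jamp2 per color flow. A host-side sketch of the selection rule with toy weights:

```cuda
#include <cstdio>

// pick the first index whose cumulative fraction exceeds rnd in [0,1)
int selectFromWeights( const double* w, int n, double rnd )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( rnd < running / total ) return i;
  }
  return n - 1; // guard against rounding; rnd < 1 normally selects earlier
}

int main()
{
  const double me[3] = { 0.5, 0.25, 0.25 }; // toy |M|^2 per good helicity
  printf( "rnd=0.40 -> ighel=%d\n", selectFromWeights( me, 3, 0.40 ) ); // 0
  printf( "rnd=0.60 -> ighel=%d\n", selectFromWeights( me, 3, 0.60 ) ); // 1
  printf( "rnd=0.90 -> ighel=%d\n", selectFromWeights( me, 3, 0.90 ) ); // 2
  return 0;
}
```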
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index a5c44d3213..4d52728816 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 3779397ce4..307fe1f6cf 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 1dae307565..ac0a02a97d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
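For orientation before the new file body (an illustrative aside, not part of the generated patch; all values and names below are hypothetical): color_sum.cc evaluates the color-summed |M|^2 for one helicity as the quadratic form sum_ij conj(jamp_i) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j, which is purely real because the color matrix is real and symmetric. A minimal standalone sketch of that reduction:

#include <complex>
#include <cstdio>
// Toy ncolor=2 sketch of the normalized color-matrix quadratic form (hypothetical values).
int main()
{
  const double C[2][2] = { { 12, 4 }, { 4, 12 } };                    // toy color matrix
  const double d[2] = { 1, 1 };                                       // toy color denominators
  const std::complex<double> J[2] = { { 0.3, -0.1 }, { 0.2, 0.4 } };  // toy jamps for one helicity
  double me2 = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ ) // real and imaginary parts only: the imaginary cross terms cancel
      me2 += ( C[i][j] / d[i] ) * ( J[i].real() * J[j].real() + J[i].imag() * J[j].imag() );
  printf( "color-summed |M|^2 contribution = %f\n", me2 );
  return 0;
}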
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
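+ // (In detail: with jamp = A + iB for real A and B, (A-iB)M(A+iB) = AMA + BMB + i( AMB - BMA ), and the imaginary cross terms cancel because xMy = yMx for a symmetric real M.)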
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h new file mode 100644 index 0000000000..24865a9858 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
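Before the per-diagram kernels that follow, a compact illustration of the pattern they share (a hypothetical CPU-side sketch with simplified types, not generated code): each diagramN computes one amplitude and folds it into the shared jamps buffer with fixed color-flow coefficients, so the diagrams can be evaluated independently, one kernel per diagram:

#include <complex>
#include <vector>
using cxtype = std::complex<double>;
// Hypothetical stand-in for one generated diagram kernel: add one amplitude
// into the shared color-flow partial sums with fixed coefficients.
static void diagramN( std::vector<cxtype>& jamps, const cxtype& amp )
{
  jamps[0] -= 1. / 2. * amp; // coefficient of color flow 0 (illustrative values)
  jamps[2] += 1. / 6. * amp; // coefficient of color flow 2
}
int main()
{
  std::vector<cxtype> jamps( 4, cxtype( 0, 0 ) ); // ncolor=4 partial amplitudes, zeroed per event
  diagramN( jamps, cxtype( 0.1, 0.2 ) );          // hypothetical amplitude from one diagram
  return 0;
}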
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index a3a57cd8b8..af49c9f60d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 6f1f37d1eb..b7d2cbe189 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gdx_ttxwpux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = 
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of the QCD partial amplitudes (jamps) + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d0dd16c512..47a8d7c2ca 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index 7c1bbde100..807ce0e6c9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index ece4509a8c..bf57b49c26 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
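For orientation, the new color_sum.cc below implements the leading-color sum |M|^2 = sum_ij jamp_i^* (colorMatrix_ij / colorDenom_i) jamp_j over the ncolor=4 partial amplitudes ("jamps"). A minimal scalar sketch of that reduction for one event and one helicity follows; this is an illustration only, with a hypothetical function name and buffer layout (the real kernels use the strided J_ACCESS/E_ACCESS accessors), but the cf/denom values are exactly the constexpr arrays hardcoded in the file:

    #include <complex>
    // Minimal scalar sketch of the leading-color sum in color_sum.cc (ncolor=4).
    double colorSumOneEvent( const std::complex<double> jamp[4] )
    {
      static constexpr double denom[4] = { 1, 1, 1, 1 };
      static constexpr double cf[4][4] = { { 12, 4, 4, 0 },
                                           { 4, 12, 0, 4 },
                                           { 4, 0, 12, 4 },
                                           { 0, 4, 4, 12 } };
      double me2 = 0;
      for( int i = 0; i < 4; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < 4; j++ ) ztemp += cf[i][j] * jamp[j]; // row of M times jamp vector
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i]; // Re( ztemp * jamp_i^* )
      }
      return me2; // |M|^2 summed over leading colors for one helicity
    }
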
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
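To spell out the comment above: for a real matrix M and jamp = A + iB with A, B real vectors, (A-iB)^T M (A+iB) = A^T M A + B^T M B plus an imaginary part i( A^T M B - B^T M A ) that vanishes because M is symmetric; folding the symmetric off-diagonal entries into a doubled upper triangle then halves the inner loop. A scalar sketch of the folded loop (illustrative names, no SIMD, assuming the "2*" factor is already baked into T exactly as in cf2 above):

    // Scalar illustration: folding a real symmetric quadratic form onto the
    // upper triangle. With T[i][i] = M[i][i] and T[i][j] = 2*M[i][j] for j > i,
    // the folded loop reproduces sum_ij ( A[i]*M[i][j]*A[j] + B[i]*M[i][j]*B[j] ).
    double quadFormFolded( const double T[4][4], const double A[4], const double B[4] )
    {
      double me2 = 0;
      for( int i = 0; i < 4; i++ )
      {
        double ztR = T[i][i] * A[i], ztI = T[i][i] * B[i]; // diagonal terms
        for( int j = i + 1; j < 4; j++ )
        {
          ztR += T[i][j] * A[j]; // off-diagonal terms already carry the factor 2
          ztI += T[i][j] * B[j];
        }
        me2 += A[i] * ztR + B[i] * ztI;
      }
      return me2;
    }
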
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h new file mode 100644 index 0000000000..a4f3be5d0a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index e550640e16..7233e3c74f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index 16d1e89a53..5cf114c48b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxwpd()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs,
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS =
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) (assume nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index f799f32129..15fdb6df3c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index e5ddbf348a..180917495b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index 4ebece2e78..e068973bd6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) C2=PDG2PDF(LPP(IB(2)),4, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
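For orientation: the new color_sum.cc (below) factors the color algebra out of CPPProcess.cc, evaluating the quadratic form ME += sum_ij jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) jamp_j over the ncolor leading-color amplitudes. A minimal standalone sketch of that computation, using plain std::complex instead of the plugin's fptype2/cxtype_sv types (the helper name naiveColorSum and the hardcoded NCOL are illustrative assumptions, not part of the generated code):

#include <array>
#include <complex>

constexpr int NCOL = 4; // ncolor for this P1_gu_ttxwpd process
constexpr double denom[NCOL] = { 1, 1, 1, 1 };
constexpr double cf[NCOL][NCOL] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

// Return the |M|^2 contribution for one helicity: jamp^dagger * (cf/denom) * jamp
double naiveColorSum( const std::array<std::complex<double>, NCOL>& jamp )
{
  double me = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < NCOL; j++ ) ztemp += cf[i][j] * jamp[j];
    // cf is real, so only Re*Re + Im*Im survives in jamp_i^* * ztemp_i
    me += ( ztemp.real() * jamp[i].real() + ztemp.imag() * jamp[i].imag() ) / denom[i];
  }
  return me;
}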
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h new file mode 100644 index 0000000000..24865a9858 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index 738301d049..765b11c693 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index 41a6e0002f..29d5e3e1ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxwmdx()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs,
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS =
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16!) ... + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16!) ... +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_jamps) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index b6253b6715..68dd39bcf2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index 8e03eed7eb..5be1675ba2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 9d0ddcecfc..f2f893278c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
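As background for this new file: the color sum it implements reduces the |M|^2 contribution of one helicity to a real quadratic form over the ncolor=4 leading-color flows, deltaME = sum_{i,j} Re( conj(jamp[i]) * cf[i][j] * jamp[j] ) / denom[i]. A minimal standalone C++ sketch of that reduction follows; the 4x4 colorMatrix and colorDenom values are the ones defined below in this file, while the jamp values are purely hypothetical, for illustration only.

  // Standalone sketch of the leading-color sum for this process (ncolor=4).
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncolor = 4;
    constexpr double denom[ncolor] = { 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = {
      { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
    // Hypothetical color flow amplitudes for one event and one helicity
    const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.05 }, { 0.0, -0.1 }, { 0.25, 0.0 } };
    double deltaME = 0; // contribution of this helicity to |M|^2
    for( int icol = 0; icol < ncolor; icol++ )
    {
      std::complex<double> ztemp = 0;
      for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
      deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
    }
    printf( "deltaME = %f\n", deltaME );
    return 0;
  }

This is the same plain double loop used by color_sum_kernel further down in this file; color_sum_cpu rearranges it into a triangular form (#475), and color_sum_blas evidently offloads the same algebra to cuBLAS/hipBLAS.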
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
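The triangular rearrangement described in the comment above can be checked in isolation: since the color matrix is real and symmetric (and colorDenom is uniform here), summing each diagonal term once and doubling the upper-diagonal terms reproduces the full double loop. A small self-contained check, again with hypothetical jamp values:

  // Check: triangular normalized color sum == full symmetric color sum (see #475).
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncolor = 4;
    constexpr double denom[ncolor] = { 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = {
      { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
    const std::complex<double> jamp[ncolor] = { { 0.4, -0.1 }, { 0.2, 0.3 }, { -0.5, 0.0 }, { 0.1, 0.1 } };
    double full = 0; // full quadratic form, all ncolor*ncolor terms
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        full += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom[i];
    double tri = 0; // triangular form: diagonal once, off-diagonal terms doubled
    for( int i = 0; i < ncolor; i++ )
    {
      double ztR = cf[i][i] / denom[i] * jamp[i].real();
      double ztI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
    }
    printf( "full = %f, triangular = %f\n", full, tri ); // identical results
    return 0;
  }

Precomputing 2 * colorMatrix[icol][jcol] / colorDenom[icol] at compile time is exactly what the constexpr TriangularNormalizedColorMatrix in color_sum_cpu below provides.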
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h new file mode 100644 index 0000000000..a4f3be5d0a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
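The generated diagrams.h that follows defines one kernel per Feynman diagram, all with a near-uniform signature so that the generated caller can launch them in sequence (diagram1 additionally receives the momenta and the helicity, since it also computes the external wavefunctions reused by the later diagrams). Each kernel repeats the same multichannel bookkeeping: |amp|^2 is added to the single-diagram-enhancement (SDE) denominator for every diagram, but to the numerator only when the event's channelId matches the diagram number. A small self-contained sketch of that bookkeeping, with a hypothetical name (addSdeWeights) and a scalar complex type in place of the plugin's SIMD/GPU types:

#include <complex>

using cxtype = std::complex<double>;

inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2

// Schematic version of the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block in every diagramN below
void addSdeWeights( unsigned int ndiag, unsigned int channelId, const cxtype& amp, double& numerator, double& denominator )
{
  if( channelId == ndiag ) numerator += cxabs2( amp ); // only the diagram selected by this event's channel
  if( channelId != 0 ) denominator += cxabs2( amp );   // every diagram (channelId == 0 disables SDE)
}

The accumulated numerator/denominator ratio is the fraction of the squared amplitude carried by the selected single diagram, which the multichannel phase-space sampling uses as its enhancement weight.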
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 12 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+    FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 12 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 12 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
+    FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 12 ***
+    // Wavefunction(s) for diagram number 4
+    // (none)
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 12 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
+    FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 12 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 12 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 12 ***
+    // Wavefunction(s) for diagram number 8
+    FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 8
+    FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 12 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 12 ***
+    // Wavefunction(s) for diagram number 10
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 12 ***
+    // Wavefunction(s) for diagram number 11
+    // (none)
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 12 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
index d8518f17f7..21de29dc55 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
      fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
index 6b3ff14d2d..81f884204a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index f90db593a9..b57648e9f6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_udx_ttxwpg()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = 
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index b4a0ccb74d..5e49bb346c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 7e750641c8..4d39b68db0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 28ad0eed08..7cbb2180db 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#include "color_sum.h"
+
+#include "mgOnGpuConfig.h"
+
+#include "MemoryAccessMatrixElements.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
+
+  //--------------------------------------------------------------------------
+
+  // *** COLOR MATRIX BELOW ***
+
+  // The color denominators (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
+
+  // The color matrix (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorMatrix[ncolor][ncolor] = {
+    { 12, 4, 4, 0 },
+    { 4, 12, 0, 4 },
+    { 4, 0, 12, 4 },
+    { 0, 4, 4, 12 } }; // 2-D array[4][4]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
+    // In addition, in C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h new file mode 100644 index 0000000000..81fbede8ee --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
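As a worked illustration of the color sum that color_sum.cc implements above, the following minimal standalone C++ sketch (not part of the patch; fptype widths, kernel splitting, streams and SIMD vectorization are all stripped out, and the function name colorSum is hypothetical) evaluates |M|^2 = sum_ij conj(J_i) * (CF_ij / denom_i) * J_j for one event and one helicity, using the same ncolor=4 color matrix quoted in the diff and the same upper-triangular optimization with doubled off-diagonal terms:

  #include <complex>

  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                          { 4, 12, 0, 4 },
                                          { 4, 0, 12, 4 },
                                          { 0, 4, 4, 12 } };

  // Since cf is real and symmetric, conj(J_i) cf_ij J_j reduces to
  // Re(J)_i cf_ij Re(J)_j + Im(J)_i cf_ij Im(J)_j ("AMA + BMB"), and only the
  // upper triangle is needed if off-diagonal terms are counted twice.
  double colorSum( const std::complex<double> jamp[ncolor] )
  {
    double me2 = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztR = cf[i][i] / denom[i] * jamp[i].real();
      double ztI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
    }
    return me2;
  }

The jamp[] inputs are the per-color amplitude sums that the diagram1..diagram12 kernels below accumulate via J_ACCESS::kernelAccessIcol( jamps, icol ), e.g. jamp[1] -= 1/2 * amp and jamp[3] += 1/6 * amp for diagram 1.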
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 12 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 12 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 12 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
+    FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 12 ***
+    // Wavefunction(s) for diagram number 4
+    // (none)
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 12 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
+    FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 12 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 12 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 12 ***
+    // Wavefunction(s) for diagram number 8
+    FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 8
+    FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 12 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 12 ***
+    // Wavefunction(s) for diagram number 10
+    VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 12 ***
+    // Wavefunction(s) for diagram number 11
+    // (none)
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 12 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
index d8518f17f7..21de29dc55 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
index 536bec2827..fecf2b47e6 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. 
+ maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt 
matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
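To make the "new1" memory layout adopted in color_sum.h above concrete, here is a small standalone sketch (the two helper functions are hypothetical, not the plugin's API): a jamp buffer of ncolor*2*nevt fptypes holds all real parts first as an ncolor x nevt block, then all imaginary parts, with the event index running fastest:

  // "new1" striding: buffer[ipart * ncolor * nevt + icol * nevt + ievt],
  // where ipart = 0 selects the real block and ipart = 1 the imaginary block.
  inline int jampRealIndex( int icol, int ievt, int ncolor, int nevt )
  {
    return 0 * ncolor * nevt + icol * nevt + ievt;
  }
  inline int jampImagIndex( int icol, int ievt, int ncolor, int nevt )
  {
    return 1 * ncolor * nevt + icol * nevt + ievt;
  }

Because each color row is contiguous in ievt, cuBLAS/hipBLAS can treat the real and imaginary blocks as two plain matrices, which is what allows the two Tgemm calls and the stride-1 batched dot products in color_sum_blas above.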
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942

   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions;   // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes;      // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators;    // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators;  // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp;      // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name]))
+
     def defaultSet(self, name, value, **opts):
         self.__setitem__(name, value, change_userdefine=False, **opts)
@@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None):
             else:
                 return ','.join([PY8Card.pythia8_formatting(arg) for arg in value])

+    # change of name convention between the old MG5 interface and Pythia8's main164
+    interface_to_164 = {'HEPMCoutput:file': 'HepMC:output',
+                        'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)',
+                        'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)',
+                        'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)',
+                        'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)',
+                        'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)',
+                        'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'}
+
     def write(self, output_file, template, read_subrun=False,
-                    print_only_visible=False, direct_pythia_input=False, add_missing=True):
+                    print_only_visible=False, direct_pythia_input=False, add_missing=True,
+                    use_mg5amc_py8_interface=False):
         """ Write the card to output_file using a specific template.
             > 'print_only_visible' specifies whether or not the hidden parameters
               should be written out if they are in the hidden_params_to_always_write
@@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False,
               in the self.visible_params_to_always_write list and are not user_set
               or system_set are commented.
             > If 'add_missing' is False then parameters that should be written_out but are absent
-            from the template will not be written out."""
+            from the template will not be written out.
+            > use_mg5amc_py8_interface indicates whether the MG5aMC-PY8 interface is used;
+              if it is not, some parameters need to be translated from the old convention to the new one
+        """
+
+        self.use_mg5amc_py8_interface = use_mg5amc_py8_interface

         # First list the visible parameters
         visible_param = [p for p in self if p.lower() not in self.hidden_param
@@ -2297,7 +2319,16 @@ def group_params(params):
                 else:
                     # Just copy parameters which don't need to be specified
                     if param.lower() not in self.params_to_never_write:
-                        output.write(line)
+
+                        if not use_mg5amc_py8_interface and direct_pythia_input and \
+                           param in self.interface_to_164:
+                            param_entry = self.interface_to_164[param.strip()]
+                            # special case for HepMC needs two flags
+                            if 'HepMC:output' == param_entry:
+                                output.write(' %s=%s\n'%('Main:HepMC', 'on'))
+                            output.write('%s=%s\n'%(param_entry,new_value))
+                        else:
+                            output.write(line)
                     else:
                         output.write('! The following parameter was forced to be commented out by MG5aMC.\n')
                         output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
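                # note: the float() casts above are defensive: the 'bench' and
                # cross-section entries may have been read back as strings, which
                # would otherwise break the numeric formatting in the write below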
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta)
+    update to_full [run_card]
+    update XXX [where XXX corresponds to a hidden block of the run_card]:
+           supported blocks are %s
+    """, ', '.join(self.update_block))

     def do_update(self, line, timer=0):
@@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0):
             logger.warning('miss an argument (dependent or missing). Please retry')
             return

+        args[0] = args[0].lower()
+
         if args[0] == 'dependent':
             if not self.mother_interface:
                 logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)')
@@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0):
                 self.modified_card.add('run') # delay writting of the run_card
                 logger.info('add optional block %s to the run_card', args[0])
             else:
-                self.help_update()
+                self.do_help('update')
                 logger.warning('unvalid options for update command. Please retry')

+
     def update_to_full(self, line):
         """ trigger via update to_full LINE"""

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
index 789976beee..c321fd88e5 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
@@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line):
         debug_file = open(self.debug_output, 'a')
         traceback.print_exc(file=debug_file)
+        if __debug__:
+            traceback.print_exc()
         if hasattr(error, 'filename'):
             debug_file.write("Related File: %s\n" % error.filename)
         # Create a nice error output
@@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout):
                 for i, name in enumerate(split):
                     try:
                         __import__('.'.join(split[:i+1]))
-                        exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])))
+                        tmp = {}
+                        exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp)
                     except ImportError:
                         try:
                             var = eval(args[1])
@@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout):
                             outstr += 'EXTERNAL:\n'
                             outstr += misc.nice_representation(var, nb_space=4)
                 else:
-                    var = eval(args[1])
+                    var = eval(args[1], globals(), tmp)
                     outstr += 'EXTERNAL:\n'
                     outstr += misc.nice_representation(var, nb_space=4)

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
index 526756129f..74ba0d195c 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
@@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}):
         else:
             raise self.FileWriterError("%s not string" % repr(input_lines))

-        # Setup the contextual environment
-        for contextual_variable, value in context.items():
-            exec('%s=%s'%(str(contextual_variable),repr(value)))
-
         res = []
         # The variable below tracks the conditional statements structure
         if_stack = []
@@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}):
             # Treat an if statement
             elif preproc_command.group('command')=='if':
                 try:
-                    if_stack.append(eval(preproc_command.group('body'))==True)
+                    if_stack.append(eval(preproc_command.group('body'), globals(), context)==True)
                 except Exception as e:
                     raise self.FilePreProcessingError('Could not evaluate'+\
                         "python expression '%s' given the context %s provided."%\

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
index 551b71ddb6..3061b007e7 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False):
     path2 = format_path(path2)
     try:
         shutil.copy(path1, path2)
+    except shutil.Error as why:
+        logger.debug('no cp since identical: %s', why)
+        return
     except IOError as why:
         import madgraph.various.misc as misc
         try:
+            if 'same file' in str(why):
+                return
             if os.path.exists(path2):
                 path2 = os.path.join(path2, os.path.split(path1)[1])
             misc.copytree(path1, path2)
@@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False):
             if error:
                 raise
             if log:
-                logger.warning(why)
+                logger.warning("fail to cp %s %s %s", path1, path2, why)
             else:
-                misc.sprint("fail to cp", why)
-        except shutil.Error:
-            # idetical file
-            pass
+                misc.sprint("fail to cp",path1,path2, why)
+

 def rm(path, log=True):
     """removes path, that can be a single element or a list"""

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
index 1810c6c082..6e0e06533d 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
@@ -137,7 +137,7 @@ until($listpos>$#incard){
 print PAGE " Model: $model \n";
 print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale 
for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! 
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h index 850b86e0e6..12a9428e02 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV2_1( const fptype allF2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV2_2( const fptype allF1[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -963,7 +963,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -975,7 +975,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -988,7 +988,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1007,7 +1007,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1020,7 +1020,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1039,7 +1039,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1052,7 +1052,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1070,7 +1070,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV2_1( const fptype allF2[], const fptype allV3[], @@ -1083,7 +1083,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -1102,7 +1102,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV2_2( const fptype allF1[], const fptype allV3[], @@ -1115,7 +1115,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1134,7 +1134,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1147,7 +1147,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc index d799b19eeb..39a3bb9a4b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h index e448052141..3a15c28ac3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -295,7 +295,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -305,10 +305,10 @@ namespace mg5amcCpu using namespace Parameters_sm_no_b_mass_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); GC_11s_sv = couplings_sv.GC_11; GC_10s_sv = couplings_sv.GC_10; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. 
using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! 
Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0a0d056033..5c42a0cf8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0064830780029296875  +DEBUG: model prefixing takes 0.005420684814453125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -208,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.135 s +13 processes with 76 diagrams generated in 0.138 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -374,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.933 s +65 processes with 1119 diagrams generated in 1.838 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -499,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 
36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
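(An aside on the DEBUG dumps above and in the following hunks: iconfig_to_diag and diag_to_iconfig are mutual inverses. Integration channels (iconfig) are numbered consecutively, while diagrams that do not map onto a single s/t-channel configuration are skipped, which is why the diagram numbers have gaps, e.g. 31: 33. A minimal sketch of this invariant, in C++ for illustration only; the excerpt values are copied from the gg_ttxgg dump above and this is not the plugin's actual code:)

#include <cassert>
#include <map>

int main()
{
  // Hypothetical excerpt of the gg_ttxgg iconfig_to_diag mapping dumped above
  const std::map<int, int> iconfig_to_diag = { { 1, 2 }, { 2, 3 }, { 30, 31 }, { 31, 33 } };
  // Invert it, presumably as model_handling.py does to build diag_to_iconfig
  std::map<int, int> diag_to_iconfig;
  for( const auto& [iconfig, diag] : iconfig_to_diag ) diag_to_iconfig[diag] = iconfig;
  // The two dumped maps must be bijective inverses of each other
  for( const auto& [iconfig, diag] : iconfig_to_diag ) assert( diag_to_iconfig.at( diag ) == iconfig );
  return 0;
}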
@@ -510,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -521,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -532,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -543,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -554,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -565,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -576,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -587,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -598,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -609,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -620,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -631,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -642,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -653,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -664,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -675,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -686,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1552]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.286 s -Wrote files for 810 helas calls in 2.762 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1665]  +Generated helas calls for 18 subprocesses (372 diagrams) in 1.292 s +Wrote files for 810 helas calls in 2.897 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.334 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -717,120 +717,120 @@ ALOHA: aloha creates 10 routines in 0.315 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 230 (offset 3 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 76 (offset 5 lines). Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 82 (offset 11 lines). Hunk #2 succeeded at 286 (offset 59 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 82 (offset 11 lines). Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 76 (offset 5 lines). Hunk #2 succeeded at 280 (offset 53 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.258s -user 0m9.633s -sys 0m0.984s -Code generation completed in 12 seconds +real 0m10.785s +user 0m9.659s +sys 0m0.978s +Code generation completed in 11 seconds ************************************************************ * * * W E L C O M E to * @@ -843,7 +843,7 @@ Code generation completed in 12 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -851,9 +851,9 @@ Code generation completed in 12 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -873,7 +873,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -881,9 +881,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index 33311e49bc..de218516de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index 5eb60f35df..fe9c38d826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 38810a6b83..0185201786 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/makefile b/epochX/cudacpp/pp_tt012j.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/makefile +++ b/epochX/cudacpp/pp_tt012j.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. 
Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
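(The Bridge.h hunks above wrap the "WARNING! Instantiate ..." printouts and the flagAbnormalMEs check in #ifdef MGONGPUCPP_VERBOSE, so default builds stay quiet. A standalone sketch of the same compile-time pattern; the function and variable names here are invented for illustration, only the MGONGPUCPP_VERBOSE macro comes from the patch:)

#include <iostream>

// Build with -DMGONGPUCPP_VERBOSE to re-enable the informational printout
void instantiateHostBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... create the matrix element kernel as in Bridge.h ...
}

(When the macro is undefined the preprocessor removes the statement entirely, so the guard has zero runtime cost.)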
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
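(The new gpuBlas* macros above give the cuBLAS and hipBLAS entry points a single spelling, and the gpuBlasT* aliases pick the float (S) or double (D) variants at compile time from MGONGPU_FPTYPE2_FLOAT. A hedged sketch of how a caller might drive a strided-batched GEMM through this layer, assuming a CUDA or HIP build with BLAS enabled; the helper name, leading dimensions and buffer layout are assumptions, not code from this patch, and checkGpuBlas is the wrapper added to GpuRuntime.h just below:)

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2; // resolves gpuBlasT* to the cublasS*/hipblasS* branch
#else
typedef double fptype2; // resolves gpuBlasT* to the cublasD*/hipblasD* branch
#endif

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
// Hypothetical helper: C[i] = A[i] * B[i] for nbatch column-major (m x k) times (k x n) products
void batchedGemm( gpuBlasHandle_t handle, int m, int n, int k,
                  const fptype2* dA, const fptype2* dB, fptype2* dC, int nbatch )
{
  const fptype2 alpha = 1, beta = 0;
  checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k,
                                            &alpha, dA, m, (long long)m * k, dB, k, (long long)k * n,
                                            &beta, dC, m, (long long)m * n, nbatch ) );
}
#endif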
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line );
+    if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS );
+  }
+}
+#endif
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -50,7 +66,7 @@ namespace mg5amcGpu
     // Set up CUDA application
     // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
     // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
-    static void setUp( const bool debug = true )
+    static void setUp( const bool debug = false ) // ZW: changed debug default to false
    {
       // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
       // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
@@ -71,7 +87,7 @@ namespace mg5amcGpu
     // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
     // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
     // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
-    static void tearDown( const bool debug = true )
+    static void tearDown( const bool debug = false ) // ZW: changed debug default to false
     {
       if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
       checkGpu( gpuDeviceReset() );
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
index 084e244cea..1ac53bb4bd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.6.0
\ No newline at end of file
+3.6.3
\ No newline at end of file
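
Two usage notes on the GpuRuntime.h changes above: assertGpuBlas gives BLAS calls the same fail-fast checking that checkGpu gives runtime calls, and the setUp/tearDown debug defaults are now false. A minimal sketch (the GpuRuntime wrapper name for these statics is an assumption based on the file and its debug messages):

  // Illustrative sketch: quiet runtime bring-up plus a checked BLAS call.
  void runtimeSketch()
  {
    mg5amcGpu::GpuRuntime::setUp(); // now silent by default (debug = false)
  #ifndef MGONGPU_HAS_NO_BLAS
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) ); // asserts unless GPUBLAS_STATUS_SUCCESS
    checkGpuBlas( gpuBlasDestroy( handle ) );
  #endif
    mg5amcGpu::GpuRuntime::tearDown( true ); // pass true to restore the old verbose trace
  }
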
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
     gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    const int nevt = m_gpublocks * m_gputhreads;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt );
 #else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    // ... 0d2. Copy back good helicity mask to the host
-    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
-    // ... 0d3. Copy back good helicity list to constant memory on the device
-    return sigmaKin_setGoodHel( hstIsGoodHel.data() );
+    // ... 0d3. Set good helicity list in host static memory
+    int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() );
+    assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity
+    // Create one GPU stream for each good helicity
+    for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      gpuStreamCreate( &m_helStreams[ighel] );
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Create one cuBLAS/hipBLAS handle for each good helicity
+    // Attach a different stream to each cuBLAS/hipBLAS handle
+    if( m_blasColorSum )
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) );
+        checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+        if( m_blasTf32Tensor )
+          checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores
+#endif
+      }
+#endif
+    // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+    // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) );
+    // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+    m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity
+    if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) );
+#else
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity
+    if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) );
+#endif
+#endif
+    // Return the number of good helicities
+    return nGoodHel;
   }
 
   //--------------------------------------------------------------------------
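
The super-buffers created above are sized as nGoodHel contiguous per-helicity slices; assuming that layout (which the allocation sizes strongly suggest), the slice arithmetic looks as follows. This is an illustrative sketch only, with invented helper names:

  // Sketch: locating one good helicity's slice inside the super-buffers
  // allocated in computeGoodHelicities (contiguous per-helicity layout assumed).
  inline fptype* helSliceMEs( fptype* ghelMEs, int ighel, int nevt )
  {
    return ghelMEs + ighel * nevt; // m_pHelMEs holds nGoodHel slices of nevt MEs
  }
  inline fptype* helSliceJamps( fptype* ghelJamps, int ighel, int ncolor, int nevt )
  {
    return ghelJamps + ighel * ncolor * mgOnGpu::nx2 * nevt; // ncolor complex jamps per event
  }
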
@@ -383,17 +507,19 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds )
   {
     gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
-#ifndef MGONGPU_NSIGHT_DEBUG
-    constexpr unsigned int sharedMemSize = 0;
+#ifndef MGONGPU_HAS_NO_BLAS
+    fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr );
+    gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr );
 #else
-    constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
+    fptype2* ghelAllBlasTmp = nullptr;
+    gpuBlasHandle_t* ghelBlasHandles = nullptr;
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads );
 #endif
 #ifdef MGONGPU_CHANNELID_DEBUG
     //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl;
@@ -401,8 +527,8 @@ namespace mg5amcGpu
     const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 
 #include <map>
+#include <memory>
 
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
 
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 
   private:
 
@@ -191,12 +194,24 @@ namespace mg5amcCpu
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;
 
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
 
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
 
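
The unique_ptr members above exist because nGoodHel is only known after helicity filtering, so the device buffers cannot be sized in the constructor. A minimal sketch of the deferred-allocation pattern (the class name is invented for illustration):

  // Sketch: a buffer that stays empty until the runtime value nGoodHel is known.
  class DeferredBufferSketch
  {
    std::unique_ptr<DeviceBufferSimple> m_pBuf; // empty until sized
  public:
    void sizeForGoodHelicities( int nGoodHel, int nevt )
    {
      m_pBuf.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // replaces any earlier buffer
    }
  };
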
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };
 
   //----------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessCouplings_H
 #define MemoryAccessCouplings_H 1
@@ -235,7 +235,7 @@ namespace mg5amcCpu
       /*
       fptype_sv& real = kernelAccessIx2( buffer, 0 );
       fptype_sv& imag = kernelAccessIx2( buffer, 1 );
-      printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+      printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
       return cxtype_sv_ref( real, imag );
       */
       return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ),
@@ -250,7 +250,7 @@ namespace mg5amcCpu
       /*
       const fptype_sv& real = kernelAccessIx2Const( buffer, 0 );
       const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 );
-      printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+      printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
       return cxtype_sv( real, imag );
       */
       return cxtype_sv( kernelAccessIx2Const( buffer, 0 ),
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
index 9f4c620bc7..bbffc1fb36 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessWavefunctions_H
 #define MemoryAccessWavefunctions_H 1
@@ -10,9 +10,7 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
+#include "CPPProcess.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
@@ -23,147 +21,44 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  // A class describing the internal layout of memory buffers for wavefunctions
-  // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessWavefunctionsBase //_AOSOAv1
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessWavefunctions
   {
   public:
-
-    // Number of Events Per Page in the wavefunction AOSOA memory buffer layout
-    static constexpr int neppW = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif
 
   //----------------------------------------------------------------------------
 
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
  {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };
 
   //----------------------------------------------------------------------------
 
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 
 #endif // MemoryAccessWavefunctions_H
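
The new DeviceAccessWavefunctions above replaces the AOSOA machinery with a plain per-event offset into a global buffer: each event owns nw6 * nx2 fptypes per wavefunction, laid out event after event. A worked sketch of the arithmetic (values invented for illustration): with nw6 = 6 components and nx2 = 2 (real+imag), event ievt = 10 starts at fptype index 10 * 6 * 2 = 120, i.e. complex index 60.

  // Sketch: the per-thread wavefunction slot used by DeviceAccessWavefunctions.
  __device__ cxtype_sv* eventWfSlot( fptype* buffer )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
    return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
  }
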
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
     static constexpr size_t nparf = CPPProcess::nparf;
     static constexpr size_t npar = CPPProcess::npar;
     static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
   }
 
   //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
   protected:
     BufferBase( const size_t size, const bool onDevice )
       : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
   public:
+    virtual ~BufferBase() {}
     T* data() { return m_data; }
     const T* data() const { return m_data; }
     T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
   public:
     HostBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
   {
   public:
     DeviceBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+  typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
   // A base class encapsulating a memory buffer for momenta random numbers
   typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
   typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
   typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
   typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventDenominators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
   typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators
   typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
   typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
   typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
   typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
   typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for jamps
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
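
A usage sketch for the ad-hoc DeviceBufferSimple typedef added above (the DeviceBuffer<fptype, 1> template arguments were lost in extraction and are reconstructed; the example itself is illustrative only):

  // Sketch: mirror a pinned host buffer into an ad-hoc device buffer.
  // DeviceBufferSimple is sized directly in elements (sizePerEvent = 1 assumed).
  void copySketch( const size_t nevt )
  {
    PinnedHostBufferNumerators hstNum( nevt ); // 1 fptype per event
    DeviceBufferSimple devNum( nevt );         // ad-hoc device buffer of nevt fptypes
    for( size_t ievt = 0; ievt < nevt; ievt++ ) hstNum[ievt] = 0;
    copyDeviceFromHost( devNum, hstNum );      // same argument order as memcpy( dst, src )
  }
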
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index a17c5f1eef..4dfd5786fe 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -96,20 +98,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 2;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -168,57 +166,112 @@ namespace mg5amcCpu
 
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
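
DeviceAccessJamp2 above adopts a structure-of-arrays layout, buffer[icol * nevt + ievt], so that the threads of a warp (consecutive ievt, fixed icol) touch consecutive memory. A hedged accumulation sketch (only the accessor comes from the patch; the kernel body and jamp layout are assumptions for illustration):

  // Sketch: accumulate |jamp|^2 per color into the jamp2 SoA buffer.
  __global__ void addJamp2Sketch( const fptype* allJamps, fptype* allJamp2, int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const fptype re = allJamps[( icol * 2 + 0 ) * nevt + ievt]; // assumed jamp layout [ncolor][2][nevt]
      const fptype im = allJamps[( icol * 2 + 1 ) * nevt + ievt];
      DeviceAccessJamp2::kernelAccessIcol( allJamp2, icol ) += re * re + im * im;
    }
  }
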
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,      // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,    // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,                // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId,  // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,         // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,       // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv            // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00             // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,                  // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,                    // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,             // input: cuda stream for this helicity
+                   const int gpublocks,               // input: cuda gpublocks
+                   const int gputhreads )             // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,        // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,      // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,              // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId,    // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,           // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,         // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )               // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const 
int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) // (signature reconstructed from its gpuLaunchKernel call site in sigmaKin) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) 
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
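+      // (Note: an interpretation of the buffer sizes above rather than documented behaviour. In mixed precision the
+      // fptype2 tmp buffer appears to hold two ncolor*nx2 blocks per event - e.g. the jamps cast to fptype2 and the
+      // matrix*jamp product - plus one fptype2 slot per event for the ME accumulator, hence the '+ 1'; in the
+      // non-mixed case a single ncolor*nx2 block per event suffices.)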
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 2d89e0e244..69c201bb45 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index 19278bca59..2db2eb3c0c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 42cc7c9d61..25e5bf68ee 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
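+// A usage sketch (illustrative only; these are the call sites added in CPPProcess.cc in this same patch):
+//   createNormalizedColorMatrix();            // GPU builds: copy the normalized color matrix to device memory (called in the CPPProcess ctor)
+//   color_sum_cpu( allMEs, jamp_sv, ievt00 ); // C++: add the color-summed |M|^2 for one helicity to the running sum over helicities
+//   color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); // CUDA/HIP, optionally via cuBLAS/hipBLAS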
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
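For reference, the two BLAS steps in color_sum_blas above amount, per event and separately for the real and imaginary jamp components, to Z(:,ievt) = C_norm * J(:,ievt) (one gemm over all events) followed by ME[ievt] += J(:,ievt) . Z(:,ievt) (one strided-batched gemm of 1x1 results, with beta=1 so the sum over helicities accumulates). A plain C++ sketch of the equivalent computation for one component, assuming the "new1" striding allJ[icol * nevt + ievt] and a hypothetical flattened colMat:

void colorSumReference( double* allMEs,       // output: allMEs[nevt], accumulated (beta=1)
                        const double* allJ,   // input: one jamp component, allJ[icol * nevt + ievt]
                        const double* colMat, // input: normalized color matrix, colMat[i * ncolor + j]
                        const int ncolor,
                        const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double z = 0; // step 1: Z[i][ievt] = sum_j colMat[i][j] * J[j][ievt]
      for( int j = 0; j < ncolor; j++ )
        z += colMat[i * ncolor + j] * allJ[j * nevt + ievt];
      me += allJ[i * nevt + ievt] * z; // step 2: batched dot product J(:,ievt) . Z(:,ievt)
    }
    allMEs[ievt] += me; // beta=1: add this helicity's contribution to the running sum
  }
}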
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index ca1785b808..7ccccfd4a5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 0979455d7a..38c869f74d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a 
SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,271 +282,135 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_uux_ttx()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 1 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -532,7 +449,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -565,6 +486,10 @@ 
namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -605,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -707,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -734,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( 
allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
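  // NB on the "ghel*" super-buffer arguments above: each super-buffer concatenates one slice per
  // good helicity, and the slice for good helicity ighel starts at a fixed stride from the base
  // pointer (this is what "index is ighel" means in the parameter comments), e.g. as used later
  // in the body of sigmaKin:
  //   fptype* hAllMEs   = ghelAllMEs   + ighel * nevt;                          // MEs[nevt] slice
  //   fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;  // jamp[ncolor*2*nevt] slice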
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -888,22 +1021,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -915,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -951,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1079,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1102,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1111,25 +1211,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1139,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1156,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1262,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index d6fa3205c0..8c4c55deaa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1; //static const int ncomb = 16; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index 6558c40922..c954e28a21 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index 86f844defe..f2f0bfb2b5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
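C     NB: in the vectorized interface below, ICONFIG is refreshed from
C     CHANNELS(IVEC) before each UNWGT call, so the unweighted event is
C     stored with the single-diagram channel actually sampled for IVEC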
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol 
< ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: 
allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h new file mode 100644 index 0000000000..f7b6636999 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h @@ -0,0 +1,51 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 1 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ec88a303fa..59b33b94c6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -310,7 +310,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -356,7 +356,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -399,21 +400,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -444,10 +448,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -456,6 +462,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..1db10f1e09 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also add this helicity's contribution to the running sums over helicities of jamp2, numerator and denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL -
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
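The comments above contrast the two buffer strategies: in C++ the wavefunctions stay in small stack-local SIMD arrays with trivial access, while in CUDA they now live in one global-memory buffer shared by the split diagram kernels, so every access needs an explicit event index. The standalone sketch below illustrates such an index map; the [iwf][iw6][re/im][ievt] ordering and the helper name wfIndex are illustrative assumptions, not the plugin's exact layout.

#include <cstddef>
#include <vector>

constexpr int nwf = 12; // #wavefunctions for this process (value quoted in the code above)
constexpr int nw6 = 6;  // components per wavefunction
constexpr int nx2 = 2;  // real and imaginary parts

// Index into a single global buffer wfs[nwf*nw6*2*nevt] shared by all diagram kernels
inline std::size_t wfIndex( int iwf, int iw6, int reim, int ievt, int nevt )
{
  return ( ( static_cast<std::size_t>( iwf ) * nw6 + iw6 ) * nx2 + reim ) * nevt + ievt;
}

int main()
{
  const int nevt = 1024;
  std::vector<double> wfs( static_cast<std::size_t>( nwf ) * nw6 * nx2 * nevt, 0. );
  wfs[wfIndex( 4, 0, 0, 17, nevt )] = 1.; // e.g. Re of component 0 of wavefunction 4, event 17
  return 0;
}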
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; -
jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?)
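The color algebra that this removed block (and the new color_sum kernels) implements is the quadratic form |M|^2 += sum_{i,j} conj(jamp[i]) * CF[i][j] * jamp[j] / denom[i], which is real because CF is a real symmetric matrix. Below is a self-contained numerical sketch of that sum, reusing the 2-color CF/DENOM data of P0_uux_ttx from the matrix1.f hunk earlier in this diff; the jamp values are made up.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 1., 1. };                                  // DENOM=1 in matrix1.f
  const double cf[ncolor][ncolor] = { { 9., 3. }, { 3., 9. } };             // dense CF for P0_uux_ttx
  const std::complex<double> jamp[ncolor] = { { 0.2, -0.1 }, { -0.3, 0.4 } }; // made-up partial amplitudes
  double deltaME = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0., ztempI = 0.;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    // Re( conj(jamp[i]) * ztemp ) = Re(jamp[i])*Re(ztemp) + Im(jamp[i])*Im(ztemp)
    deltaME += ( jamp[i].real() * ztempR + jamp[i].imag() * ztempI ) / denom[i];
  }
  std::printf( "deltaME = %f\n", deltaME ); // prints 2.100000 for these toy values
  return 0;
}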
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** -
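The TriangularNormalizedColorMatrix removed just above encodes the issue #475 optimization: for a real symmetric CF with a common denominator, the double loop over colors can be folded into the upper triangle, doubling the off-diagonal entries and pre-applying 1/denom at compile time. The standalone sketch below (same toy values as the previous example) checks that the folded form reproduces the full quadratic form.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 1., 1. }; // the fold assumes a common denominator, as in the generated matrices
  const double cf[ncolor][ncolor] = { { 9., 3. }, { 3., 9. } };
  const std::complex<double> jamp[ncolor] = { { 0.2, -0.1 }, { -0.3, 0.4 } };
  // Full quadratic form over all (i,j) pairs
  double me1 = 0.;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += ( jamp[i].real() * cf[i][j] * jamp[j].real() + jamp[i].imag() * cf[i][j] * jamp[j].imag() ) / denom[i];
  // Folded upper-triangular form: diagonal kept once, off-diagonal doubled, 1/denom pre-applied
  double cf2[ncolor][ncolor] = {};
  for( int i = 0; i < ncolor; i++ )
  {
    cf2[i][i] = cf[i][i] / denom[i];
    for( int j = i + 1; j < ncolor; j++ ) cf2[i][j] = 2 * cf[i][j] / denom[i];
  }
  double me2 = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf2[i][i] * jamp[i].real();
    double ztempI = cf2[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += cf2[i][j] * jamp[j].real();
      ztempI += cf2[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 ); // both forms give the same |M|^2 contribution
  return 0;
}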
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output:
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..163076da52 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..bc9333bb5d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
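+// NB: schematically, for one helicity, the three implementations below all evaluate the same color sum (a sketch, not generated code): +// ME += sum_{icol,jcol} ( jampR[icol] * colorMatrix[icol][jcol] * jampR[jcol] + jampI[icol] * colorMatrix[icol][jcol] * jampI[jcol] ) / colorDenom[icol] +// in SIMD C++ (color_sum_cpu), in a plain GPU kernel (color_sum_kernel), or via cuBLAS/hipBLAS gemms (color_sum_blas).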
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or equivalent) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
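+// NB: a minimal usage sketch (the assumed call site is calculate_jamps, with one launch per good helicity on that helicity's stream): +// gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, stream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); +// gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, stream, wfs, jamps, channelIds, couplings, numerators, denominators ); +// Only diagram1 takes momenta and ihel: it also computes the external wavefunctions, which later diagrams reuse through the shared wfs buffer.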
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 16 *** + // Wavefunction(s) for diagram number 16 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..850d121618 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
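Every generated diagramN kernel above shares the same argument list and the same two-line preamble, with only the HELAS calls and jamp updates varying per diagram. The shared preamble lives in "diagram_boilerplate.h", which is not part of this diff; the sketch below is an assumption of what such a header could contain (the names channelId, numerators_sv, denominators_sv and the nullptr checks are taken from the comments in the kernels above, everything else is hypothetical), shown only to make the repeated "asserts that all three pointers are nullptr" comment concrete:

    // diagram_boilerplate.h (sketch, NOT the actual file in the repository)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    #ifdef MGONGPUCPP_GPUIMPL
      const unsigned int channelId = gpu_channelId( channelIds ); // per-event scalar channelId (see gpu_channelId later in this diff)
    #else
      const unsigned int channelId = ( channelIds != nullptr ? channelIds[0] : 0 ); // SCALAR channelId for the whole SIMD vector
    #endif
      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );     // running sum over helicities
      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); // running sum over helicities
    #else
      assert( channelIds == nullptr );   // uniform interface: must be nullptr when SDE is compiled out
      assert( numerators == nullptr );   // uniform interface: must be nullptr when SDE is compiled out
      assert( denominators == nullptr ); // uniform interface: must be nullptr when SDE is compiled out
    #endif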
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
index c2eadb2c31..10332b6238 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index 797b19405d..850d121618 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(9)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I,  1),I=  1,  6) /7.111111111111111D+00,
-     $ -8.888888888888888D-01,-8.888888888888888D-01
-     $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D
-     $ +00/
+      DATA DENOM/9/
+      DATA (CF(I),I=  1,  6) /64,-16,-16,2,2,20/
 C     1 T(1,2,5,3,4)
-      DATA (CF(I,  2),I=  1,  6) /-8.888888888888888D-01
-     $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D
-     $ +00,-8.888888888888888D-01,1.111111111111111D-01/
+      DATA (CF(I),I=  7, 11) /64,2,20,-16,2/
 C     1 T(1,5,2,3,4)
-      DATA (CF(I,  3),I=  1,  6) /-8.888888888888888D-01
-     $ ,1.111111111111111D-01,7.111111111111111D+00,
-     $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D
-     $ -01/
+      DATA (CF(I),I= 12, 15) /64,-16,20,2/
 C     1 T(2,1,5,3,4)
-      DATA (CF(I,  4),I=  1,  6) /1.111111111111111D-01
-     $ ,1.111111111111111D+00,-8.888888888888888D-01
-     $ ,7.111111111111111D+00,1.111111111111111D-01,
-     $ -8.888888888888888D-01/
+      DATA (CF(I),I= 16, 18) /64,2,-16/
 C     1 T(2,5,1,3,4)
-      DATA (CF(I,  5),I=  1,  6) /1.111111111111111D-01,
-     $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D
-     $ -01,7.111111111111111D+00,-8.888888888888888D-01/
+      DATA (CF(I),I= 19, 20) /64,-16/
 C     1 T(5,1,2,3,4)
-      DATA (CF(I,  6),I=  1,  6) /1.111111111111111D+00
-     $ ,1.111111111111111D-01,1.111111111111111D-01,
-     $ -8.888888888888888D-01,-8.888888888888888D-01
-     $ ,7.111111111111111D+00/
+      DATA (CF(I),I= 21, 21) /64/
 C     1 T(5,2,1,3,4)
 C     ----------
 C     BEGIN CODE
 C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
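The matrix1.f change above replaces the dense NCOLOR x NCOLOR matrix of REAL*8 fractions by the packed upper triangle of integers plus a single common denominator: the old CF(1,1)=7.111...=64/9 becomes the packed entry 64 with DENOM=9, and each off-diagonal entry is pre-doubled (old CF(1,2)=-0.888...=-8/9 becomes -16), so the J>=I loop together with the implicit real part taken when accumulating into the REAL*8 MATRIX1 reproduces the full symmetric quadratic form with a single division at the end. A small standalone C++ check of this equivalence, using the CF/DENOM values from the hunk above and arbitrary (non-physics) jamp values:

    #include <complex>
    #include <cstdio>
    int main()
    {
      const int ncolor = 6;
      const int denom = 9;
      const int cf[21] = { 64, -16, -16, 2, 2, 20, // I=1, J=1..6
                           64, 2, 20, -16, 2,      // I=2, J=2..6
                           64, -16, 20, 2,         // I=3, J=3..6
                           64, 2, -16,             // I=4, J=4..6
                           64, -16,                // I=5, J=5..6
                           64 };                   // I=6, J=6
      const std::complex<double> jamp[ncolor] = { { 1, 2 }, { 0, 1 }, { -1, 1 }, { 2, 0 }, { 1, -1 }, { 0.5, 0.5 } };
      // (a) packed triangular sum, as in the new Fortran: off-diagonal cf entries are pre-doubled
      double me1 = 0;
      int idx = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
        me1 += std::real( ztemp * std::conj( jamp[i] ) ); // imaginary parts cancel pairwise
      }
      me1 /= denom; // single division, as in "MATRIX1 = MATRIX1/DENOM"
      // (b) full symmetric matrix sum, as in the old Fortran (rebuild CF(J,I) from the packed triangle)
      double full[ncolor][ncolor];
      idx = 0;
      for( int i = 0; i < ncolor; i++ )
        for( int j = i; j < ncolor; j++ )
        {
          const double v = double( cf[idx++] ) / denom;
          full[i][j] = ( j == i ? v : v / 2 ); // undo the pre-doubling off the diagonal
          full[j][i] = full[i][j];
        }
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += full[j][i] * jamp[j];
        me2 += std::real( ztemp * std::conj( jamp[i] ) );
      }
      std::printf( "packed %.12f full %.12f\n", me1, me2 ); // identical up to rounding
      return 0;
    }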
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
index 4f8f49270b..1ee522dbfd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,20 +101,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 4;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -171,57 +169,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,   // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,             // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
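The DeviceAccessJamp2 accessor added above encodes a color-major structure-of-arrays layout, buffer[icol * nevt + ievt], so that for a fixed color index consecutive CUDA threads (consecutive ievt) touch consecutive memory words and the access coalesces. A standalone illustration of the indexing pattern (a hypothetical kernel, not part of this diff; double is used here in place of the plugin's fptype typedef):

    __global__ void scaleIcol( double* buffer, const int icol, const double factor )
    {
      const int nevt = gridDim.x * blockDim.x;                 // total number of events = total number of threads
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // this thread's event index
      buffer[icol * nevt + ievt] *= factor;                    // coalesced: stride-1 in ievt for fixed icol
    }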
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,           // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream, // input: cuda stream for this helicity
+                   const int gpublocks,   // input: cuda gpublocks
+                   const int gputhreads ) // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,         // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,        // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,      // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta;        // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions;  // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes;     // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings;     // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = DeviceAccessNumerators;   // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
    using M_ACCESS = HostAccessMomenta;          // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements;   // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions;    // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes;       // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = HostAccessCouplings;       // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = HostAccessCouplingsFixed;  // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -229,333 +282,143 @@ namespace mg5amcCpu
    using DEN_ACCESS = HostAccessDenominators;   // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
     // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
     // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
    cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1];      // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
    // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
    // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
    for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
      const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
      const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-      // *** DIAGRAM 1 OF 5 ***
-
-      // Wavefunction(s) for diagram number 1
-      vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-      ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
-      FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] );
-      FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] -= 1. / 2. * amp_sv[0];
-      jamp_sv[2] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 5 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
-      FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-      jamp_sv[0] -= 1. / 2. * amp_sv[0];
-      jamp_sv[1] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 3 OF 5 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[1] += 1. / 6. * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 4 OF 5 ***
-
-      // Wavefunction(s) for diagram number 4
-      FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] );
-
-      // Amplitude(s) for diagram number 4
-      FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[2] += 1. / 6. * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 5 OF 5 ***
-      // Wavefunction(s) for diagram number 5
-      // (none)
-
-      // Amplitude(s) for diagram number 5
-      VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?)
-
-      // The color denominators (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
-
-      // The color matrix (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 cf[ncolor][ncolor] = {
-        { 12, 4, 4, 0 },
-        { 4, 12, 0, 4 },
-        { 4, 0, 12, 4 },
-        { 0, 4, 4, 12 } }; // 2-D array[4][4]
-
-#ifndef MGONGPUCPP_GPUIMPL
-      // Pre-compute a constexpr triangular color matrix properly normalized #475
-      struct TriangularNormalizedColorMatrix
-      {
-        // See https://stackoverflow.com/a/34465458
-        __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-          : value()
-        {
-          for( int icol = 0; icol < ncolor; icol++ )
-          {
-            // Diagonal terms
-            value[icol][icol] = cf[icol][icol] / denom[icol];
-            // Off-diagonal terms
-            for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-              value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-          }
-        }
-        fptype2 value[ncolor][ncolor];
-      };
-      static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-      {
-        // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp_sv_previous[icol] = jamp_sv[icol];
-        MEs_previous = MEs;
-        continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-      }
-      fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-      // Sum and square the color flows to get the matrix element
-      // (compute |M|^2 by squaring |M|, taking into account colours)
-      fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-      // Use the property that M is a real matrix (see #475):
-      // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-      // In addition, on C++ use the property that M is symmetric (see #475),
-      // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-      // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-      // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv jampR_sv[ncolor] = { 0 };
-      fptype2_sv jampI_sv[ncolor] = { 0 };
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-        jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-      }
-#endif
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-        // === C++ START ===
-        // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRi_sv = jampR_sv[icol];
-        fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-        fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-        fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-        fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-        fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-        // Off-diagonal terms
-        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-        {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          fptype2_sv& jampRj_sv = jampR_sv[jcol];
-          fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-          fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-          fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-          ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-        }
-        fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        deltaMEs_previous += fpvsplit0( deltaMEs2 );
-        deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-        deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-        // === C++ END ===
 #else
-        // === CUDA START ===
-        fptype2_sv ztempR_sv = { 0 };
-        fptype2_sv ztempI_sv = { 0 };
-        for( int jcol = 0; jcol < ncolor; jcol++ )
-        {
-          fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-          fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-          ztempR_sv += cf[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf[icol][jcol] * jampIj_sv;
-        }
-        deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-        // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-      }
 
-      // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
 
-      // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
-      MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
-      MEs_sv_previous += deltaMEs_previous;
-#endif
-      /*
+      // *** DIAGRAMS 1 TO 5 ***
 #ifdef MGONGPUCPP_GPUIMPL
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
-#else
-#ifdef MGONGPU_CPPSIMD
-      if( cNGoodHel > 0 )
-        for( int ieppV = 0; ieppV < neppV; ieppV++ )
-          printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+      gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+      gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
 #else
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif
+      diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+      diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
 #endif
-      */
-    } // END LOOP ON IPARITY
-    mgDebug( 1, __FUNCTION__ );
+    }
+    // *****************************
+    // ***  END LOOP ON IPARITY  ***
+    // *****************************
+    return;
   }
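In the new CUDA path, calculate_jamps no longer computes anything on the host: it only enqueues one small kernel per Feynman diagram on the caller-supplied stream, so kernels for the same helicity serialise (preserving the wavefunction data dependencies through the global-memory allWfs buffer) while different good helicities can run concurrently on different streams. The gpuLaunchKernelStream wrapper is defined in the GPU abstraction headers, which are not part of this diff; on CUDA it plausibly reduces to a triple-chevron launch along the lines of the sketch below (an assumption, shown only to clarify the execution model, not the actual macro):

    // Hypothetical definition, assuming no dynamic shared memory is needed by the diagram kernels
    #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) \
      kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )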
@@ -610,7 +473,11 @@ namespace mg5amcCpu
 #else
    memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
-    fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+    // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+    fpeEnable();
+#endif
  }
 
  //--------------------------------------------------------------------------
 
@@ -644,6 +511,10 @@ namespace mg5amcCpu
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
    // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
    // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
    const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -685,6 +556,10 @@ namespace mg5amcCpu
    m_masses.push_back( Parameters_sm::mdl_MT );
    m_masses.push_back( Parameters_sm::mdl_MT );
    m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
  }
 #endif
 
@@ -787,26 +662,26 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using G_ACCESS = DeviceAccessGs;
-    using C_ACCESS = DeviceAccessCouplings;
-    G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+    using CD_ACCESS = DeviceAccessCouplings;
+    G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
 #else
    using namespace mg5amcCpu;
    using G_ACCESS = HostAccessGs;
-    using C_ACCESS = HostAccessCouplings;
+    using CD_ACCESS = HostAccessCouplings;
    for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
      const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
      fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
    }
 #endif
  }
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -814,25 +689,40 @@ namespace mg5amcCpu
                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel )           // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps,           // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,             // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
+                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+    assert( nevt >= neppV );
     const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+      for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+      {
+        allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+        ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+      }
+      // Event-by-event random choice of helicity #403
+      //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+      for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+      {
+        if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+        {
+          const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+          allselhel[ievt] = ihelF;
+          //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+          break;
+        }
+      }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s )     // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+      atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..2d49642e74 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..a45203b57e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) 
in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..034f15c587 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
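For orientation before the generated kernels: each diagramN below adds one Feynman amplitude into a few color flows (jamps), and the per-helicity ME is obtained afterwards by folding the jamps with the color matrix defined in color_sum.cc. A minimal standalone sketch of that dataflow, reusing the ncolor=4 color matrix above and the jamp coefficients of the five diagrams below, with placeholder amplitude values (illustration only, not generated code):

    // Sketch: accumulate color flows per diagram, then fold with the color matrix.
    // The amp[] values are placeholders; only the dataflow mirrors the kernels below.
    #include <complex>
    #include <iostream>
    int main()
    {
      constexpr int ncolor = 4; // as in this P1 subprocess
      using cx = std::complex<double>;
      const double denom[ncolor] = { 1, 1, 1, 1 }; // color denominators (see color_sum.cc)
      const double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
      const cx amp[5] = { { 1, 0 }, { 0, 1 }, { 0.5, 0.5 }, { -1, 0 }, { 0, -1 } }; // placeholders, not physics
      cx jamp[ncolor] = {};
      jamp[0] -= 1. / 2. * amp[0]; jamp[2] += 1. / 6. * amp[0]; // as in diagram1
      jamp[0] -= 1. / 2. * amp[1]; jamp[1] += 1. / 6. * amp[1]; // as in diagram2
      jamp[1] += 1. / 6. * amp[2]; jamp[3] -= 1. / 2. * amp[2]; // as in diagram3
      jamp[2] += 1. / 6. * amp[3]; jamp[3] -= 1. / 2. * amp[3]; // as in diagram4
      jamp[0] += 1. / 2. * cx( 0, 1 ) * amp[4]; jamp[3] -= 1. / 2. * cx( 0, 1 ) * amp[4]; // as in diagram5
      double me = 0; // |M|^2 for one helicity: sum_ij conj(jamp_i) cf_ij jamp_j / denom_i
      for( int i = 0; i < ncolor; i++ )
      {
        cx ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
        me += std::real( std::conj( jamp[i] ) * ztemp ) / denom[i];
      }
      std::cout << "ME (one helicity, placeholder amps): " << me << std::endl;
      return 0;
    }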
+ + /* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 9394a561b8..2efe3ea8fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2d65a2667..50d05d273c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute the running sums over helicities of jamp2s, numerators and denominators + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
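A note on the uniform diagramN interface used by the new kernels above: every diagram function takes the same channelIds/numerators/denominators pointers, and builds without multichannel support pass nullptr for all three, which the included boilerplate asserts on. A minimal sketch of that pattern in plain C++ follows (exampleDiagram and kSupportsMultichannel are hypothetical stand-ins, not the generated code nor the real diagram_boilerplate.h):

#include <cassert>

using fptype = double;

// Stand-in for one generated diagramN function: the parameter list is the same
// whether or not multichannel (SDE) support is compiled in, so call sites never change.
void exampleDiagram( fptype* wfs, fptype* jamps,
                     const unsigned int* channelIds, // nullptr when SDE is disabled or compiled out
                     fptype* numerators, fptype* denominators )
{
  constexpr bool kSupportsMultichannel = false; // stand-in for MGONGPU_SUPPORTS_MULTICHANNEL
  if( !kSupportsMultichannel )
  {
    // mirror of the boilerplate sanity check: all three pointers must be nullptr
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
  }
  // ... compute wavefunctions and amplitudes here, then update jamps ...
  (void)wfs;
  (void)jamps;
}

int main()
{
  fptype wfs[8] = {};
  fptype jamps[8] = {};
  exampleDiagram( wfs, jamps, nullptr, nullptr, nullptr ); // uniform call site in all build modes
  return 0;
}

With this shape, the five diagramN call sites in calculate_jamps below stay identical in all build modes; only the pointer setup above them differs.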
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
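The removed comments above rely on two algebraic facts worth spelling out. Because the color matrix cf is real, the quadratic form jamp† cf jamp splits into separate real and imaginary parts (AMA + BMB, with no cross terms); because cf is also symmetric, the sum can run over the upper triangle only, folding the factor 2 and the 1/denom normalization into the matrix at compile time, exactly as TriangularNormalizedColorMatrix does here and as the INTEGER CF/DENOM storage does in the matrix1.f hunk above. A standalone numerical check of that equivalence, using the ncolor=4 matrix and unit denominators quoted from the removed code (plain C++ sketch with made-up jamp values, not the generated kernel):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } }; // hypothetical values
  // Dense form: cf is real, so (A-iB) cf (A+iB) = A cf A + B cf B (no imaginary cross terms)
  double me2dense = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me2dense += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  // Triangular form: cf is also symmetric, so fold "2*" and "/denom[i]" into the matrix
  double me2tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2tri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( me2dense - me2tri ) < 1e-12 ); // both compute |M|^2 = sum_ij jamp_i† cf_ij jamp_j / denom_i
  return 0;
}

The triangular variant computes the same |M|^2 with roughly half the multiply-adds, which is why the C++ path uses it while CUDA keeps the dense loop.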
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
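select_col above, like the helicity choice in add_and_select_hel, draws one entry from a discrete distribution by walking a cumulative sum of weights and stopping at the first bin whose cumulative fraction exceeds a uniform random number. A minimal standalone sketch of that selection pattern (plain C++; selectBin and the weight values are hypothetical, and the returned index is 1-based to mirror the Fortran convention used for allselcol):

#include <cassert>

// Pick one bin from n weights using a uniform random number rnd in [0,1):
// build the running sum, then return the first bin whose cumulative fraction exceeds rnd.
int selectBin( const double* weights, int n, double rnd )
{
  double cumulative[16]; // this sketch assumes n <= 16
  assert( n <= 16 );
  double sum = 0;
  for( int i = 0; i < n; i++ )
  {
    sum += weights[i];
    cumulative[i] = sum;
  }
  for( int i = 0; i < n; i++ )
    if( rnd < cumulative[i] / cumulative[n - 1] ) return i + 1; // 1-based, like allselcol/allselhel
  return n; // unreachable for rnd < 1 and positive total weight
}

int main()
{
  const double jamp2[4] = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical per-color weights (cumulative 0.1, 0.5, 0.7, 1.0)
  assert( selectBin( jamp2, 4, 0.05 ) == 1 );
  assert( selectBin( jamp2, 4, 0.49 ) == 2 );
  assert( selectBin( jamp2, 4, 0.99 ) == 4 );
  return 0;
}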
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..a246f2aab0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..d50f96bb8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) 
in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..d44286c433 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + FFV1_2( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c7fdad381b..0a96a485e5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 4f41927bc9..b884fba722 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif
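// [Illustrative sketch, not from the generated code: one plausible coalesced layout for the
//  global "wf[nwf*nw6*2*nevt]" buffer described above, which lets split per-diagram CUDA
//  kernels share wavefunctions across kernel launches. The accessor name wfAccessSketch and
//  the exact index order are assumptions, not the plugin's actual API.]
__device__ inline fptype& wfAccessSketch( fptype* allWfs, int iwf, int iw6, int ix2, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  return allWfs[( ( iwf * nw6 + iw6 ) * 2 + ix2 ) * nevt + ievt]; // ix2 = 0 (real) or 1 (imag)
}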
// === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0];
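// [Illustrative sketch, assuming plain scalar doubles rather than the SIMD/GPU types: the
//  multichannel single-diagram-enhancement (SDE) accumulation pattern visible in the deleted
//  lines above. For each diagram, the numerator picks up |amp|^2 only for the selected
//  channel, while the denominator sums |amp|^2 over all diagrams; sigmaKin later rescales
//  the ME by numerator/denominator. The function name sdeAccumulateSketch is hypothetical.]
inline void sdeAccumulateSketch( unsigned int channelId, unsigned int idiagram, double amp2, double& numerator, double& denominator )
{
  if( channelId == idiagram ) numerator += amp2; // only the single "enhanced" diagram
  if( channelId != 0 ) denominator += amp2;      // all diagrams (channelId == 0 disables SDE)
}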
- - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0];
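// [Illustrative sketch, assuming std::complex scalars: how each diagram's amplitude is spread
//  over the leading-color flows with fixed coefficients, as in the deleted lines above (e.g.
//  diagram 2 adds +1/6*amp to jamp[0] and -1/2*amp to jamp[1]). The helper below and its
//  colorCoef argument are hypothetical.]
#include <complex>
inline void addDiagramToJampsSketch( std::complex<double>* jamp, const std::complex<double>& amp, const std::complex<double>* colorCoef, int ncol )
{
  for( int icol = 0; icol < ncol; icol++ )
    jamp[icol] += colorCoef[icol] * amp; // colorCoef entries are values such as 0, +1/6, -1/2 or +-i/2 per diagram
}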
- - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_uux_ttxg()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
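// [Illustrative sketch, assuming plain std::complex scalars instead of the SIMD/GPU types: the
//  color sum that turns the jamps into |M|^2 = sum_{i,j} conj(jamp_i) * cf[i][j] * jamp_j / denom[i],
//  with the 4x4 color matrix quoted above. Since cf is real, only the Re*Re and Im*Im products
//  survive (the "AMA + BMB" identity in the comment above); colorSumSketch is hypothetical.]
#include <complex>
inline double colorSumSketch( const std::complex<double> jamp[4] )
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  double me2 = 0;
  for( int icol = 0; icol < 4; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 4; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // the "AMA" (real x real) contribution
      ztempI += cf[icol][jcol] * jamp[jcol].imag(); // the "BMB" (imag x imag) contribution
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}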
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // input: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + 
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
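// [Illustrative sketch of the cumulative-probability draw used both here in select_col and in
//  the helicity choice of add_and_select_hel: with non-negative weights w[i] and their running
//  sums t[i], the first index i with rnd < t[i]/t[n-1] is selected with probability w[i]/sum(w).
//  pickFromWeightsSketch is hypothetical.]
inline int pickFromWeightsSketch( const double* w, int n, double rnd ) // rnd is flat in [0,1)
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( rnd < running / total ) return i; // running/total is the cumulative probability t[i]/t[n-1]
  }
  return n - 1; // guard against floating-point rounding when rnd is very close to 1
}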
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0;
+ gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - }
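// [Illustrative sketch of the per-helicity "super-buffer" slicing used in the loop above:
//  each good helicity ighel owns one contiguous slice of the ghelAll* buffers, so the
//  per-helicity CUDA/HIP streams can run concurrently without synchronisation. The offsets
//  simply restate the layouts documented in the signatures (2 = real/imag parts); the helper
//  names are hypothetical.]
inline fptype* jampsSliceSketch( fptype* ghelAllJamps, int ighel, int nevt, int ncolor ) { return ghelAllJamps + ighel * nevt * ncolor * 2; }
inline fptype* wfsSliceSketch( fptype* ghelAllWfs, int ighel, int nevt, int nwf, int nw6 ) { return ghelAllWfs + ighel * nwf * nevt * nw6 * 2; }
inline fptype* mesSliceSketch( fptype* ghelAllMEs, int ighel, int nevt ) { return ghelAllMEs + ighel * nevt; }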
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index ebf14aca9e..13e15fe5b2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
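// [Worked example of the nwf counting for this P1 subprocess, based on the five diagrams
//  shown above: npar = 5 external wavefunctions (u, u~, t, t~, g) plus 3 internal propagator
//  slots (w_fp[5], w_fp[6], w_fp[7], reused across diagrams) give nwf = 5 + 3 = 8.]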
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index d46dad4fcb..bfdf29ed05 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index d8e94d91bb..f1a7d1c5bc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / 
colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB, whose real part is AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- +
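As a cross-check of the triangular rewrite above, the identity can be verified in isolation. A minimal standalone sketch (plain C++, no SIMD; the ncolor=4 color matrix and denominators are the ones defined in this file, the jamp test values are arbitrary; an illustration, not plugin code):

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 4;
  const double denom[ncolor] = { 1, 1, 1, 1 };
  const double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -3., 0.5 }, { 0., -1. }, { 2., 2. } };
  // Full complex quadratic form: sum_ij conj(jamp[i]) * cf[i][j]/denom[i] * jamp[j]
  std::complex<double> me1 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += std::conj( jamp[i] ) * cf[i][j] / denom[i] * jamp[j];
  // Triangular real form: diagonal once, off-diagonals doubled (as in cf2 above)
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // AMA + BMB
  }
  assert( std::abs( me1.imag() ) < 1e-9 ); // the quadratic form is real
  assert( std::abs( me1.real() - me2 ) < 1e-9 ); // both forms agree
  return 0;
}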
+#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB!
Just in case this may be better for performance reasons, however, the same striding as in calculate_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h new file mode 100644 index 0000000000..4281eca976 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
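The color_sum_gpu entry point above is designed to be called once per good helicity, with its own stream and BLAS handle (the ghelStreams and ghelBlasHandles super-buffers introduced in sigmaKin). Stripped of all plugin types, the scheduling idea is the standard one-stream-per-independent-work-item CUDA pattern; a self-contained sketch (illustrative only: the dummy kernel, buffer size and nGoodHel value are placeholders):

#include <cstdio>
#include <cuda_runtime.h>
// Placeholder for the real per-helicity work (diagram kernels + color sum)
__global__ void perHelicityWork( float* out, int ihel )
{
  out[threadIdx.x] += ihel;
}
int main()
{
  constexpr int nGoodHel = 4; // placeholder: the real code uses cNGoodHel <= ncomb
  cudaStream_t streams[nGoodHel];
  float* buffers[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    cudaStreamCreate( &streams[ighel] );
    cudaMalloc( &buffers[ighel], 256 * sizeof( float ) );
    cudaMemsetAsync( buffers[ighel], 0, 256 * sizeof( float ), streams[ighel] );
  }
  // Kernels launched on distinct streams may overlap on the device
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    perHelicityWork<<<1, 256, 0, streams[ighel]>>>( buffers[ighel], ighel );
  cudaDeviceSynchronize(); // join all helicity streams before the final reduction over helicities
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    cudaFree( buffers[ighel] );
    cudaStreamDestroy( streams[ighel] );
  }
  printf( "done\n" );
  return 0;
}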
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
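In the notation of color_sum.cc, each diagramN kernel folds its single amplitude \(a_N\) into the partial color amplitudes \(J_i\) (the jamps) with fixed coefficients; diagram 1 above, for instance, performs

\[ J_1 \mathrel{-}= \tfrac{1}{2}\, a_1, \qquad J_3 \mathrel{+}= \tfrac{1}{6}\, a_1, \]

and once all five diagrams have run for a given helicity, the color sum evaluates

\[ |M|^2_{\mathrm{hel}} \;=\; \sum_{i,j=0}^{n_{\mathrm{color}}-1} J_i^{*}\, \frac{C_{ij}}{d_i}\, J_j, \]

with \(C\) = colorMatrix and \(d\) = colorDenom as defined in color_sum.cc.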
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + }
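The numerators and denominators accumulated by each diagramN kernel implement the multichannel single-diagram-enhancement weight applied at the end of sigmaKin (the allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt] rescaling, or its normalise_output kernel equivalent shown earlier): with \(a_d\) the amplitude of diagram \(d\) and \(c\) the channel selected for the event,

\[ |M|^2 \;\longrightarrow\; |M|^2 \times \frac{\sum_{\mathrm{hel}} |a_c|^2}{\sum_{\mathrm{hel}} \sum_{d} |a_d|^2}, \]

i.e. the numerator only accumulates the diagram matching channelId, while the denominator accumulates every diagram whenever single-diagram enhancement is active (channelId != 0).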
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 787dae76b2..c03378a882 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(5,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(5,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(5,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(5,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
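The packed color-matrix storage introduced above can be checked with a few lines of standalone C++ (an illustration using the NCOLOR=4 data of this hunk, not generated code): the upper triangle is stored row by row with off-diagonal entries pre-doubled, so the running-index triangular loop of the next hunk reproduces the full symmetric sum, with the common DENOM applied once at the end. A real test vector is used for simplicity (JAMP is complex in the Fortran, but the packing argument is identical):

#include <cassert>
int main()
{
  constexpr int ncolor = 4;
  // Packed upper triangle, row by row, off-diagonals pre-doubled: matches DATA (CF(I),...) above
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  // Full symmetric matrix, as in the old DATA (CF(I,J),...) statements
  const int cfDense[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const double jamp[ncolor] = { 0.5, -1.25, 2.0, 0.75 }; // arbitrary (exactly representable) test values
  double full = 0, packed = 0;
  int cfIndex = 0; // running index, as CF_INDEX in MATRIX1
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempFull = 0, ztempPacked = 0;
    for( int j = 0; j < ncolor; j++ ) ztempFull += cfDense[i][j] * jamp[j];
    for( int j = i; j < ncolor; j++ ) ztempPacked += cfPacked[cfIndex++] * jamp[j];
    full += ztempFull * jamp[i];
    packed += ztempPacked * jamp[i];
  }
  assert( packed == full ); // exact equality for these dyadic test values (DENOM=1 here)
  return 0;
}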
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index da962495fd..63e8317212 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2412 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
 cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
 // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
 // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
 for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
 const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+ // -----------------
+ // --- COUPLINGS ---
+ // -----------------
#ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
- // CUDA kernels take input/output buffers with momenta/MEs for all events
- const fptype* momenta = allmomenta;
- const fptype* COUPs[nxcoup];
- for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
- fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
- fptype* denominators = allDenominators;
-#endif
+ // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+ const fptype* couplings = allcouplings;
#else
- // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
- const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+ // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
 const fptype* COUPs[nxcoup];
+ // Dependent couplings, vary event-by-event
 for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
- fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
- fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
- // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
- for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
- fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
- // *** DIAGRAM 1 OF 123 ***
-
- // Wavefunction(s) for diagram number 1
- vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
- vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
- oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
- ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
- vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
- vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
- VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
-
- // Amplitude(s) for diagram number 1
- VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 2 OF 123 ***
-
- // Wavefunction(s) for diagram number 2
- VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 2
- VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 3 OF 123 ***
-
- // Wavefunction(s) for diagram number 3
- VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 3
- VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 4 OF 123 ***
-
- // Wavefunction(s) for diagram number 4
- VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 4
- VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
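[Editor's note: the jamp_sv updates after each amplitude call above implement one step of the color decomposition: every diagram's amplitude, weighted by plus or minus 1 or i, is added to the color flows it feeds. A minimal standalone sketch of that pattern, with scalar std::complex standing in for the plugin's SIMD cxtype_sv and an invented amplitude value:

    #include <complex>
    int main()
    {
      using cxtype = std::complex<double>;
      constexpr int ncolor = 24;       // gg->ttgg has 24 color flows
      cxtype jamp[ncolor] = {};        // "= {}" matters: a scalar complex is not zero-initialized otherwise
      const cxtype amp( 0.3, -0.7 );   // toy value for one diagram's amplitude
      jamp[0] += cxtype( 0, 1 ) * amp; // this diagram feeds flow 0 with weight +i
      jamp[1] -= cxtype( 0, 1 ) * amp; // ... and flow 1 with weight -i
      return jamp[0] == -jamp[1] ? 0 : 1;
    }
]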
- // *** DIAGRAM 5 OF 123 ***
-
- // Wavefunction(s) for diagram number 5
- FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] );
- FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 5
- FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 6 OF 123 ***
-
- // Wavefunction(s) for diagram number 6
- // (none)
-
- // Amplitude(s) for diagram number 6
- FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 7 OF 123 ***
-
- // Wavefunction(s) for diagram number 7
- FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] );
-
- // Amplitude(s) for diagram number 7
- FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 8 OF 123 ***
-
- // Wavefunction(s) for diagram number 8
- FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
-
- // Amplitude(s) for diagram number 8
- FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 9 OF 123 ***
-
- // Wavefunction(s) for diagram number 9
- // (none)
-
- // Amplitude(s) for diagram number 9
- FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 10 OF 123 ***
-
- // Wavefunction(s) for diagram number 10
- FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
-
- // Amplitude(s) for diagram number 10
- FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 11 OF 123 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 12 OF 123 ***
-
- // Wavefunction(s) for diagram number 12
- // (none)
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 13 OF 123 ***
-
- // Wavefunction(s) for diagram number 13
- // (none)
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 14 OF 123 ***
-
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 15 OF 123 ***
-
- // Wavefunction(s) for diagram number 15
- // (none)
-
- // Amplitude(s) for diagram number 15
- FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 16 OF 123 ***
-
- // Wavefunction(s) for diagram number 16
- // (none)
-
- // Amplitude(s) for diagram number 16
- FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 17 OF 123 ***
-
- // Wavefunction(s) for diagram number 17
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 17
- FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 18 OF 123 ***
-
- // Wavefunction(s) for diagram number 18
- FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
- // Amplitude(s) for diagram number 18
- FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
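[Editor's note: the w_fp pointer table set up before diagram 1 (w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] )) lets the helicity routines take plain fptype* buffers while the diagram code keeps reading the same storage as complex vectors. A scalar sketch of that dual view, assuming (as in the double-precision build) that one complex value is laid out as two consecutive fptype, which std::complex guarantees:

    #include <complex>
    #include <cstdio>
    int main()
    {
      using fptype = double;
      std::complex<fptype> w[6] = { { 1, 2 } };         // one toy wavefunction, nw6 = 6 components
      fptype* w_fp = reinterpret_cast<fptype*>( w );    // same bytes, viewed as 12 fptype
      std::printf( "re=%f im=%f\n", w_fp[0], w_fp[1] ); // prints 1 and 2: re/im are adjacent
      return 0;
    }
]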
- // *** DIAGRAM 19 OF 123 ***
-
- // Wavefunction(s) for diagram number 19
- // (none)
-
- // Amplitude(s) for diagram number 19
- FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 20 OF 123 ***
-
- // Wavefunction(s) for diagram number 20
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
-
- // Amplitude(s) for diagram number 20
- VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 21 OF 123 ***
-
- // Wavefunction(s) for diagram number 21
- // (none)
-
- // Amplitude(s) for diagram number 21
- FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 22 OF 123 ***
-
- // Wavefunction(s) for diagram number 22
- // (none)
-
- // Amplitude(s) for diagram number 22
- FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 23 OF 123 ***
-
- // Wavefunction(s) for diagram number 23
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
-
- // Amplitude(s) for diagram number 23
- VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 24 OF 123 ***
-
- // Wavefunction(s) for diagram number 24
- // (none)
-
- // Amplitude(s) for diagram number 24
- FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 25 OF 123 ***
-
- // Wavefunction(s) for diagram number 25
- // (none)
-
- // Amplitude(s) for diagram number 25
- FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 26 OF 123 ***
-
- // Wavefunction(s) for diagram number 26
- FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
-
- // Amplitude(s) for diagram number 26
- FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 27 OF 123 ***
-
- // Wavefunction(s) for diagram number 27
- // (none)
-
- // Amplitude(s) for diagram number 27
- FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 28 OF 123 ***
-
- // Wavefunction(s) for diagram number 28
- // (none)
-
- // Amplitude(s) for diagram number 28
- FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 29 OF 123 ***
-
- // Wavefunction(s) for diagram number 29
- // (none)
-
- // Amplitude(s) for diagram number 29
- FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 30 OF 123 ***
-
- // Wavefunction(s) for diagram number 30
- // (none)
-
- // Amplitude(s) for diagram number 30
- FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 31 OF 123 ***
-
- // Wavefunction(s) for diagram number 31
- // (none)
-
- // Amplitude(s) for diagram number 31
- VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 32 OF 123 ***
-
- // Wavefunction(s) for diagram number 32
- VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
- VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
- VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 32
- FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
-
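[Editor's note: every "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" block above performs the same two updates, which drive single-diagram enhancement: the numerator keeps only the |amplitude|² of the sampled channel, while the denominator sums |amplitude|² over all channels. A scalar sketch with plain doubles instead of fptype_sv:

    #include <complex>
    // Accumulate one diagram's contribution to the multichannel weight.
    void addChannel( unsigned channelId, unsigned thisDiagram, std::complex<double> amp,
                     double& numerator, double& denominator )
    {
      const double amp2 = std::norm( amp );             // |amp|^2, i.e. what cxabs2() returns
      if( channelId == thisDiagram ) numerator += amp2; // only the sampled channel
      if( channelId != 0 ) denominator += amp2;         // all channels (0 disables multichannel)
    }
]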
- // *** DIAGRAM 33 OF 123 ***
-
- // Wavefunction(s) for diagram number 33
- FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 33
- FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 34 OF 123 ***
-
- // Wavefunction(s) for diagram number 34
- FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 34
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 35 OF 123 ***
-
- // Wavefunction(s) for diagram number 35
- // (none)
-
- // Amplitude(s) for diagram number 35
- FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 36 OF 123 ***
-
- // Wavefunction(s) for diagram number 36
- FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 36
- VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 37 OF 123 ***
-
- // Wavefunction(s) for diagram number 37
- // (none)
-
- // Amplitude(s) for diagram number 37
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 38 OF 123 ***
-
- // Wavefunction(s) for diagram number 38
- // (none)
-
- // Amplitude(s) for diagram number 38
- FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 39 OF 123 ***
-
- // Wavefunction(s) for diagram number 39
- // (none)
-
- // Amplitude(s) for diagram number 39
- VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 40 OF 123 ***
- // Wavefunction(s) for diagram number 40
- // (none)
-
- // Amplitude(s) for diagram number 40
- FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 41 OF 123 ***
-
- // Wavefunction(s) for diagram number 41
- // (none)
-
- // Amplitude(s) for diagram number 41
- FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 42 OF 123 ***
-
- // Wavefunction(s) for diagram number 42
- FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 42
- FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 43 OF 123 ***
-
- // Wavefunction(s) for diagram number 43
- // (none)
-
- // Amplitude(s) for diagram number 43
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 44 OF 123 ***
-
- // Wavefunction(s) for diagram number 44
- // (none)
-
- // Amplitude(s) for diagram number 44
- FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 45 OF 123 ***
-
- // Wavefunction(s) for diagram number 45
- // (none)
-
- // Amplitude(s) for diagram number 45
- FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 46 OF 123 ***
-
- // Wavefunction(s) for diagram number 46
- // (none)
-
- // Amplitude(s) for diagram number 46
- FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
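[Editor's note: the "+" block just above is the new two-level couplings lookup: dependent couplings live in a per-event buffer, independent couplings in one shared constant array, and COUPs[] unifies both behind a single pointer table. A compilable sketch of the idea with an illustrative flat layout (not the plugin's AOSOA access classes):

    #include <cstddef>
    int main()
    {
      constexpr std::size_t nevt = 4, ndcoup = 2, nicoup = 1;
      double depBuf[ndcoup][nevt] = {};      // dependent: one record per event
      double indBuf[nicoup] = { 1.218 };     // independent: fixed for all events (toy value)
      const double* COUPs[ndcoup + nicoup];
      const std::size_t ievt0 = 2;           // first event of the current "page"
      for( std::size_t i = 0; i < ndcoup; i++ ) COUPs[i] = &depBuf[i][ievt0];   // event-by-event
      for( std::size_t i = 0; i < nicoup; i++ ) COUPs[ndcoup + i] = &indBuf[i]; // shared
      return COUPs[0] != nullptr ? 0 : 1;
    }
]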
- // *** DIAGRAM 47 OF 123 ***
-
- // Wavefunction(s) for diagram number 47
- // (none)
-
- // Amplitude(s) for diagram number 47
- VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 48 OF 123 ***
-
- // Wavefunction(s) for diagram number 48
- // (none)
-
- // Amplitude(s) for diagram number 48
- FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 49 OF 123 ***
-
- // Wavefunction(s) for diagram number 49
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
- FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 49
- FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 50 OF 123 ***
-
- // Wavefunction(s) for diagram number 50
- VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 50
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 51 OF 123 ***
-
- // Wavefunction(s) for diagram number 51
- // (none)
-
- // Amplitude(s) for diagram number 51
- FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 52 OF 123 ***
-
- // Wavefunction(s) for diagram number 52
- FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 52
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 53 OF 123 ***
-
- // Wavefunction(s) for diagram number 53
- // (none)
-
- // Amplitude(s) for diagram number 53
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 54 OF 123 ***
-
- // Wavefunction(s) for diagram number 54
- // (none)
-
- // Amplitude(s) for diagram number 54
- FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 55 OF 123 ***
-
- // Wavefunction(s) for diagram number 55
- // (none)
-
- // Amplitude(s) for diagram number 55
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 56 OF 123 ***
-
- // Wavefunction(s) for diagram number 56
- // (none)
-
- // Amplitude(s) for diagram number 56
- FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 57 OF 123 ***
-
- // Wavefunction(s) for diagram number 57
- // (none)
-
- // Amplitude(s) for diagram number 57
- VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 58 OF 123 ***
-
- // Wavefunction(s) for diagram number 58
- // (none)
-
- // Amplitude(s) for diagram number 58
- VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
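[Editor's note: cxabs2(), used in all the numerator/denominator updates above, is the squared modulus without the square root that std::abs would take. A scalar equivalent, assuming std::complex in place of the plugin's cxtype_sv:

    #include <complex>
    inline double cxabs2( const std::complex<double>& z )
    {
      return z.real() * z.real() + z.imag() * z.imag(); // same value as std::norm( z )
    }
]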
- // *** DIAGRAM 59 OF 123 ***
-
- // Wavefunction(s) for diagram number 59
- VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 59
- VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 60 OF 123 ***
-
- // Wavefunction(s) for diagram number 60
- // (none)
-
- // Amplitude(s) for diagram number 60
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 61 OF 123 ***
-
- // Wavefunction(s) for diagram number 61
- // (none)
-
- // Amplitude(s) for diagram number 61
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 62 OF 123 ***
-
- // Wavefunction(s) for diagram number 62
- // (none)
-
- // Amplitude(s) for diagram number 62
- FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 63 OF 123 ***
-
- // Wavefunction(s) for diagram number 63
- // (none)
-
- // Amplitude(s) for diagram number 63
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 64 OF 123 ***
-
- // Wavefunction(s) for diagram number 64
- // (none)
-
- // Amplitude(s) for diagram number 64
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 65 OF 123 ***
-
- // Wavefunction(s) for diagram number 65
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 65
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 66 OF 123 ***
-
- // Wavefunction(s) for diagram number 66
- VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 66
- FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 67 OF 123 ***
-
- // Wavefunction(s) for diagram number 67
- // (none)
-
- // Amplitude(s) for diagram number 67
- FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 68 OF 123 ***
-
- // Wavefunction(s) for diagram number 68
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 68
- FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 69 OF 123 ***
-
- // Wavefunction(s) for diagram number 69
- // (none)
-
- // Amplitude(s) for diagram number 69
- FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 70 OF 123 ***
-
- // Wavefunction(s) for diagram number 70
- // (none)
-
- // Amplitude(s) for diagram number 70
- FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 71 OF 123 ***
-
- // Wavefunction(s) for diagram number 71
- // (none)
-
- // Amplitude(s) for diagram number 71
- FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
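[Editor's note: in the C++ backend every *_sv quantity in these diagrams is a short vector: one "event page" of neppV events advances through the identical instruction stream in lockstep, which is what the two-page iParity loop above iterates over. A portable stand-in for the compiler vector types the plugin actually uses (a sketch, not the real mgOnGpuVectors.h types):

    #include <cstddef>
    constexpr std::size_t neppV = 4;      // events per page (build-time constant)
    struct fptype_v { double d[neppV]; }; // one slot per event
    // Adding two pages processes neppV events with one auto-vectorizable loop.
    fptype_v operator+( const fptype_v& a, const fptype_v& b )
    {
      fptype_v r;
      for( std::size_t i = 0; i < neppV; i++ ) r.d[i] = a.d[i] + b.d[i];
      return r;
    }
]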
- // *** DIAGRAM 72 OF 123 ***
-
- // Wavefunction(s) for diagram number 72
- // (none)
-
- // Amplitude(s) for diagram number 72
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 73 OF 123 ***
-
- // Wavefunction(s) for diagram number 73
- // (none)
-
- // Amplitude(s) for diagram number 73
- VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 74 OF 123 ***
-
- // Wavefunction(s) for diagram number 74
- // (none)
-
- // Amplitude(s) for diagram number 74
- VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 75 OF 123 ***
-
- // Wavefunction(s) for diagram number 75
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
- // Amplitude(s) for diagram number 75
- VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 76 OF 123 ***
-
- // Wavefunction(s) for diagram number 76
- // (none)
-
- // Amplitude(s) for diagram number 76
- VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 77 OF 123 ***
-
- // Wavefunction(s) for diagram number 77
- // (none)
-
- // Amplitude(s) for diagram number 77
- FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 78 OF 123 ***
-
- // Wavefunction(s) for diagram number 78
- // (none)
-
- // Amplitude(s) for diagram number 78
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 79 OF 123 ***
-
- // Wavefunction(s) for diagram number 79
- // (none)
-
- // Amplitude(s) for diagram number 79
- FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 80 OF 123 ***
-
- // Wavefunction(s) for diagram number 80
- // (none)
-
- // Amplitude(s) for diagram number 80
- FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 81 OF 123 ***
-
- // Wavefunction(s) for diagram number 81
- FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 81
- FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 82 OF 123 ***
-
- // Wavefunction(s) for diagram number 82
- FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 82
- FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 83 OF 123 ***
-
- // Wavefunction(s) for diagram number 83
- // (none)
-
- // Amplitude(s) for diagram number 83
- FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
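[Editor's note: the mass/width pair cIPD[0], cIPD[1] passed to the FFV1_1/FFV1_2 calls above enters through the internal-particle propagator: the ALOHA routines ending in _1/_2/_3 return an off-shell wavefunction carrying a Breit-Wigner-like denominator. A sketch of just that factor (not the real generated routines):

    #include <complex>
    // Propagator denominator factor 1 / (p^2 - M^2 + i*M*Gamma).
    std::complex<double> propagatorFactor( double p2, double mass, double width )
    {
      return 1.0 / std::complex<double>( p2 - mass * mass, mass * width );
    }
]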
- // *** DIAGRAM 84 OF 123 ***
-
- // Wavefunction(s) for diagram number 84
- FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 84
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 85 OF 123 ***
-
- // Wavefunction(s) for diagram number 85
- // (none)
-
- // Amplitude(s) for diagram number 85
- FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 86 OF 123 ***
-
- // Wavefunction(s) for diagram number 86
- VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 86
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 87 OF 123 ***
-
- // Wavefunction(s) for diagram number 87
- FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 87
- FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 88 OF 123 ***
-
- // Wavefunction(s) for diagram number 88
- FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 88
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 89 OF 123 ***
-
- // Wavefunction(s) for diagram number 89
- // (none)
-
- // Amplitude(s) for diagram number 89
- FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 90 OF 123 ***
-
- // Wavefunction(s) for diagram number 90
- FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
- // Amplitude(s) for diagram number 90
- FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 91 OF 123 ***
-
- // Wavefunction(s) for diagram number 91
- // (none)
-
- // Amplitude(s) for diagram number 91
- FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 92 OF 123 ***
-
- // Wavefunction(s) for diagram number 92
- // (none)
-
- // Amplitude(s) for diagram number 92
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 93 OF 123 ***
-
- // Wavefunction(s) for diagram number 93
- // (none)
-
- // Amplitude(s) for diagram number 93
- VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 94 OF 123 ***
-
- // Wavefunction(s) for diagram number 94
- VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 94
- VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 95 OF 123 ***
-
- // Wavefunction(s) for diagram number 95
- VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
- // Amplitude(s) for diagram number 95
- VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
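[Editor's note: the jamp_sv flows accumulated throughout these diagrams are not squared one by one; after the last diagram they are contracted with the color matrix, |M|² = Σ_ij jamp_i* cf_ij jamp_j / denom_i. A sketch with an invented 2×2 matrix (the real gg→ttgg one is 24×24):

    #include <complex>
    double colorSum( const std::complex<double> jamp[2] )
    {
      static const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // toy color factors (assumption)
      static const double denom[2] = { 3, 3 };                   // toy denominators (assumption)
      double me2 = 0;
      for( int i = 0; i < 2; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < 2; j++ ) ztemp += cf[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i];
      }
      return me2;
    }
]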
- // *** DIAGRAM 96 OF 123 ***
-
- // Wavefunction(s) for diagram number 96
- // (none)
-
- // Amplitude(s) for diagram number 96
- FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 97 OF 123 ***
-
- // Wavefunction(s) for diagram number 97
- // (none)
-
- // Amplitude(s) for diagram number 97
- FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 98 OF 123 ***
-
- // Wavefunction(s) for diagram number 98
- // (none)
-
- // Amplitude(s) for diagram number 98
- FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 99 OF 123 ***
-
- // Wavefunction(s) for diagram number 99
- // (none)
-
- // Amplitude(s) for diagram number 99
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 100 OF 123 ***
-
- // Wavefunction(s) for diagram number 100
- // (none)
-
- // Amplitude(s) for diagram number 100
- VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 101 OF 123 ***
-
- // Wavefunction(s) for diagram number 101
- VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 101
- VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 102 OF 123 ***
-
- // Wavefunction(s) for diagram number 102
- // (none)
-
- // Amplitude(s) for diagram number 102
- VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 103 OF 123 ***
-
- // Wavefunction(s) for diagram number 103
- // (none)
-
- // Amplitude(s) for diagram number 103
- FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 104 OF 123 ***
-
- // Wavefunction(s) for diagram number 104
- // (none)
-
- // Amplitude(s) for diagram number 104
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 105 OF 123 ***
-
- // Wavefunction(s) for diagram number 105
- // (none)
-
- // Amplitude(s) for diagram number 105
- FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 106 OF 123 ***
-
- // Wavefunction(s) for diagram number 106
- // (none)
-
- // Amplitude(s) for diagram number 106
- FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
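[Editor's note: all of this sits inside a loop over the helicity index ihel, with cHel[ihel][k] supplying the k-th particle's helicity to the vxxxxx/ixxxxx/oxxxxx calls at the top. For six external particles there are 2^6 = 64 combinations; a sketch of how such a table can be enumerated (an illustration, not the plugin's generated cHel):

    // Fill hel[64][6] with every +-1 helicity assignment for 6 external legs.
    void fillHelicities( short hel[64][6] )
    {
      for( int ihel = 0; ihel < 64; ihel++ )
        for( int k = 0; k < 6; k++ )
          hel[ihel][k] = ( ihel >> k ) & 1 ? +1 : -1;
    }
]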
- VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 108 OF 123 ***
-
- // Wavefunction(s) for diagram number 108
- // (none)
-
- // Amplitude(s) for diagram number 108
- VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 109 OF 123 ***
-
- // Wavefunction(s) for diagram number 109
- // (none)
-
- // Amplitude(s) for diagram number 109
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 110 OF 123 ***
-
- // Wavefunction(s) for diagram number 110
- // (none)
-
- // Amplitude(s) for diagram number 110
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 111 OF 123 ***
-
- // Wavefunction(s) for diagram number 111
- // (none)
-
- // Amplitude(s) for diagram number 111
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 112 OF 123 ***
-
- // Wavefunction(s) for diagram number 112
- // (none)
-
- // Amplitude(s) for diagram number 112
- FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
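The jamp_sv updates in these deleted blocks implement the color decomposition: each diagram's amplitude is scattered into a few of the ncolor=24 color flows, with weights +1, -1, +i or -i fixed by the color algebra. The same scatter pattern, distilled into a hypothetical helper (not part of the generated code):

#include <array>
#include <complex>
#include <initializer_list>

using cxtype = std::complex<double>;
constexpr int ncolor = 24;

struct JampTerm { int icol; cxtype weight; };

// Add one diagram's amplitude into the color flows it feeds.
inline void addToColorFlows( std::array<cxtype, ncolor>& jamp, const cxtype& amp,
                             std::initializer_list<JampTerm> terms )
{
  for( const auto& t : terms ) jamp[t.icol] += t.weight * amp;
}

// e.g. "jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];"
// becomes: addToColorFlows( jamp, amp, { { 18, { 0, 1 } }, { 19, { 0, -1 } } } );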
-
- // *** DIAGRAM 113 OF 123 ***
-
- // Wavefunction(s) for diagram number 113
- // (none)
-
- // Amplitude(s) for diagram number 113
- FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 114 OF 123 ***
-
- // Wavefunction(s) for diagram number 114
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 114
- VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 115 OF 123 ***
-
- // Wavefunction(s) for diagram number 115
- // (none)
-
- // Amplitude(s) for diagram number 115
- FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 116 OF 123 ***
-
- // Wavefunction(s) for diagram number 116
- // (none)
-
- // Amplitude(s) for diagram number 116
- FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 117 OF 123 ***
-
- // Wavefunction(s) for diagram number 117
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 117
- VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 118 OF 123 ***
-
- // Wavefunction(s) for diagram number 118
- // (none)
-
- // Amplitude(s) for diagram number 118
- FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 119 OF 123 ***
-
- // Wavefunction(s) for diagram number 119
- // (none)
-
- // Amplitude(s) for diagram number 119
- FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 120 OF 123 ***
-
- // Wavefunction(s) for diagram number 120
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-
- // Amplitude(s) for diagram number 120
- FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 121 OF 123 ***
-
- // Wavefunction(s) for diagram number 121
- // (none)
-
- // Amplitude(s) for diagram number 121
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 122 OF 123 ***
-
- // Wavefunction(s) for diagram number 122
- // (none)
-
- // Amplitude(s) for diagram number 122
- VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 123 OF 123 ***
-
- // Wavefunction(s) for diagram number 123
- // (none)
-
- // Amplitude(s) for diagram number 123
- VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if(
jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxgg()?) - - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - 
- __host__ __device__ constexpr TriangularNormalizedColorMatrix()
- : value()
- {
- for( int icol = 0; icol < ncolor; icol++ )
- {
- // Diagonal terms
- value[icol][icol] = cf[icol][icol] / denom[icol];
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on even pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv jampR_sv[ncolor] = { 0 };
- fptype2_sv jampI_sv[ncolor] = { 0 };
- for( int icol = 0; icol < ncolor; icol++ )
- {
- jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
- jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
- }
-#endif
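The deleted comments above spell out the optimization being dropped here: because the color matrix M is real, the quadratic form (A-iB)M(A+iB) collapses to AMA + BMB, and because M is symmetric (with a uniform denominator) only the upper triangle needs to be visited once the "2*" and "/denom" factors are folded in at compile time. A self-contained sketch of that color sum (toy 2x2 values, not the physical 24x24 matrix):

#include <array>

using fptype = double;
constexpr int NCOL = 2; // toy size; the real gg->ttgg matrix is 24x24

constexpr fptype cf[NCOL][NCOL] = { { 512, -64 }, { -64, 512 } }; // toy values
constexpr fptype denom[NCOL] = { 54, 54 };

// |M|^2 = sum_ij jamp_i* (cf_ij/denom_i) jamp_j, using that cf is real and
// symmetric: visit the diagonal once and the upper triangle with a factor 2.
inline fptype colorSum( const std::array<fptype, NCOL>& jampR,
                        const std::array<fptype, NCOL>& jampI )
{
  fptype me2 = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    fptype ztempR = cf[i][i] / denom[i] * jampR[i];
    fptype ztempI = cf[i][i] / denom[i] * jampI[i];
    for( int j = i + 1; j < NCOL; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jampR[j];
      ztempI += 2 * cf[i][j] / denom[i] * jampI[j];
    }
    me2 += jampR[i] * ztempR + jampI[i] * ztempI; // AMA + BMB
  }
  return me2;
}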
- for( int icol = 0; icol < ncolor; icol++ )
- {
- //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
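The MOMENTA and JAMPS plumbing above relies on the cudacpp AOSOA memory layout, in which events are grouped into pages of neppV events so that one C++ call to a diagram kernel processes a full SIMD vector. A simplified sketch of the indexing behind an ieventAccessRecord-style accessor (illustrative, not the plugin's real template machinery):

using fptype = double;

constexpr int npar = 6;  // external particles in gg->ttgg
constexpr int np4 = 4;   // E, px, py, pz
constexpr int neppV = 4; // events per SIMD page (toy value)

// AOSOA layout: momenta[ipagV][ipar][ip4][ieppV]; the 'event record' for
// ievt0 = ipagV * neppV is simply the start of its page.
inline const fptype* ieventAccessRecordConst( const fptype* buffer, int ievt0 )
{
  const int ipagV = ievt0 / neppV;
  return buffer + ipagV * npar * np4 * neppV;
}

// One momentum component of one event inside that record:
inline fptype pIparIp4Ieppv( const fptype* record, int ipar, int ip4, int ieppV )
{
  return record[( ipar * np4 + ip4 ) * neppV + ieppV];
}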
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif
- */
- } // END LOOP ON IPARITY
- mgDebug( 1, __FUNCTION__ );
+ }
+ // *****************************
+ // *** END LOOP ON IPARITY ***
+ // *****************************
+ return;
}
@@ -2718,7 +738,11 @@ namespace mg5amcCpu
#else
memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
#endif
- fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+ // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+ fpeEnable();
+#endif
}
//--------------------------------------------------------------------------
@@ -2753,6 +777,10 @@ namespace mg5amcCpu
m_masses.push_back( m_pars->mdl_MT );
m_masses.push_back( m_pars->ZERO );
m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
// Read physics parameters like masses and couplings from user configuration files (static: initialize once)
// Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -2795,6 +823,10 @@ namespace mg5amcCpu
m_masses.push_back( Parameters_sm::mdl_MT );
m_masses.push_back( Parameters_sm::ZERO );
m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
}
#endif
@@ -2897,26 +929,26 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
using G_ACCESS = DeviceAccessGs;
- using C_ACCESS = DeviceAccessCouplings;
- G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+ using CD_ACCESS = DeviceAccessCouplings;
+ G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
#else
using namespace mg5amcCpu;
using G_ACCESS = HostAccessGs;
- using C_ACCESS = HostAccessCouplings;
+ using CD_ACCESS = HostAccessCouplings;
for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
{
const int ievt0 = ipagV * neppV;
const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
- G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+ G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
}
#endif
}
//--------------------------------------------------------------------------
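The fpeEnable hunk above makes SIGFPE trapping debug-only. On Linux/glibc the underlying mechanism is feenableexcept; a minimal sketch of such a guard (assuming glibc with _GNU_SOURCE, and the MGONGPUCPP_DEBUG flag introduced in the diff):

#include <fenv.h> // feenableexcept is a glibc extension (compile with -D_GNU_SOURCE)

// Sketch: turn invalid operations, divide-by-zero and overflow into SIGFPE,
// but only in debug builds, mirroring the #ifdef added in the diff above.
inline void fpeEnableSketch()
{
#if defined( MGONGPUCPP_DEBUG ) && defined( __GLIBC__ )
  feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW );
#endif
}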
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+ void /* clang-format off */
sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -2924,25 +956,40 @@ namespace mg5amcCpu
fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
- { /* clang-format on */
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+ fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ { /* clang-format on */
+ const int maxtry0 = 16;
+ fptype hstMEs[maxtry0];
+ const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
assert( nevt >= neppV );
const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+ ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+ }
+ // Event-by-event random choice of helicity #403
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+ {
+ const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+ allselhel[ievt] = ihelF;
+ //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+ break;
+ }
+ }
+ return;
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+ fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+ {
+ using J_ACCESS = DeviceAccessJamp;
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icol = 0; icol < ncolor; icol++ )
+ // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
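In update_jamp2s above, each helicity may be processed in its own CUDA stream, so several kernel instances can hit the same per-color, per-event |jamp|^2 slot concurrently; that is why the sum must use atomicAdd. A standalone CUDA sketch of the same pattern (illustrative buffer layout; atomicAdd on double needs compute capability 6.0 or later):

// One thread per event: each helicity's kernel adds its |jamp|^2 into the same
// running-sum slot, so the update must be atomic across concurrent streams.
__global__ void updateJamp2sSketch( const double2* jamps, // [ncolor*nevt] complex jamps, one helicity
                                    double* jamp2s,       // [ncolor*nevt] running sums over helicities
                                    int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double2 j = jamps[icol * nevt + ievt];
    atomicAdd( &jamp2s[icol * nevt + ievt], j.x * j.x + j.y * j.y ); // += |jamp|^2
  }
}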
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  select_col( int* allselcol,                    // output: color selection[nevt]
+              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+    // SCALAR channelId for the current event (CUDA)
+    unsigned int channelId = gpu_channelId( allChannelIds );
+    // Event-by-event random choice of color #402
+    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      if( channelId > mgOnGpu::nchannels )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+      }
+      // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+      fptype_sv jamp2_sv[ncolor] = { 0 };
+      assert( allJamp2s != nullptr ); // sanity check
+      using J2_ACCESS = DeviceAccessJamp2;
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+        jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+      if( iconfig <= 0 )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+        assert( iconfig > 0 ); // SANITY CHECK #917
+      }
+      else if( iconfig > (int)mgOnGpu::nconfigSDE )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+      }
+      fptype targetamp[ncolor] = { 0 };
+      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( icolC == 0 )
+          targetamp[icolC] = 0;
+        else
+          targetamp[icolC] = targetamp[icolC - 1];
+        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+      }
+      //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+        {
+          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+          //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+          break;
+        }
+      }
+    }
+    else
+    {
+      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+    }
+    return;
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
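Aside: the numerator and denominator super-buffers feed the single-diagram-enhancement rescaling that normalise_output applies further down, on top of the spin/color averaging by helcolDenominators. A one-line sketch of that channel weight (illustrative names, assuming multichannel mode):

inline double sdeReweightedME( double me,           // |M|^2 summed over helicities
                               double numerator,    // sum over helicities of |amp_channel|^2
                               double denominator ) // sum over helicities of sum_d |amp_d|^2
{
  return me * numerator / denominator; // enhance the contribution of the selected diagram
}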
   // Evaluate |M|^2, part independent of incoming flavour
-  __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  void
   sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,    // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,       // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+            fptype* ghelAllMEs,      // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllJamps,    // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllWfs,      // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const int gpublocks,   // input: cuda gpublocks
+            const int gputhreads ) // input: cuda gputhreads
+#else
+  void
+  sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+            const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+            const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            const fptype* allrndcol, // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
 #endif
+            fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
-            int* allselcol  // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
-            , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,          // output: color selection[nevt]
+            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-            ) /* clang-format on */
+            const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+            )
+#endif /* clang-format on */
   {
     mgDebugInitialise();

@@ -3080,20 +1316,14 @@ namespace mg5amcCpu
     // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
     constexpr int nprocesses = 1;
     static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
-    constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+    constexpr int process_id = 1; // code generation source: standalone_cudacpp
     static_assert( process_id == 1, "Assume process_id == 1" );
   }

   // Denominators: spins, colors and identical particles
   constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef MGONGPUCPP_GPUIMPL
-  // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
-  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3141,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+    // In multichannel mode, also compute the running sums over helicities of the numerators and denominators (squared jamp2s are handled in step 1b below)
     for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
       const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
 #else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
 #endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
     }
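Aside: the loop above issues one calculate_jamps per good helicity on its own stream, so independent helicities can execute concurrently. A minimal CUDA sketch of the one-helicity-per-stream pattern (illustrative names; the plugin's gpuStream_t and gpuLaunchKernelStream abstract cudaStream_t and hipStream_t):

#include <cuda_runtime.h>

__global__ void helicityKernel( float* buf ) { /* work for one good helicity */ }

void launchPerHelicity( float** helBufs, cudaStream_t* helStreams, int nGoodHel, int blocks, int threads )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    helicityKernel<<<blocks, threads, 0, helStreams[ighel]>>>( helBufs[ighel] ); // kernels in different streams may overlap
  cudaDeviceSynchronize(); // join all helicity streams before combining their results
}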
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = 0;
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
-      //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
-        {
-          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-          //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
-          break;
-        }
-      }
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s );
     }
+#endif
+    // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+    if( !ghelBlasHandles )
+      assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    else
+      assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu)
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
-      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr );
+      if( hAllBlasTmp )
+        gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0)
+#else
+      fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr );
+      if( hAllBlasTmp )
+        gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...)
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+      gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -3329,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index b6e3ba16d4..22c61c860f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================

@@ -19,6 +19,7 @@

 #include "mgOnGpuVectors.h"

+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"

 #include <vector>

@@ -75,17 +76,17 @@ namespace mg5amcCpu
     static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
     static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
     static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+    static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-

     // Hardcoded parameters for this process (constant class variables)
     // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
     // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)]
-    // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-    //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)]
+    static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)

     // Other variables of this instance (???)
     //static const int ninitial = CPPProcess::npari;
     //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles)
     //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here)
     //static const int namplitudes = 159;
     //static const int ncomb = 64; // CPPProcess::ncomb

@@ -122,23 +123,26 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps, // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs,   // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,  // output: isGoodHel[ncomb] - device array (GPU implementation)
+                       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #else
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
 #endif
                        bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
                        const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)

@@ -152,34 +156,46 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,    // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol,       // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 850bc73f22..1418b77839 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 7af9753fb7..60103eb65c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
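Aside: the new color_sum.cc added below computes |M|^2 as the quadratic form J^dagger (CF/denom) J over the ncolor QCD partial amplitudes; the imaginary part vanishes because the color matrix is real and symmetric. For orientation, a naive loop-level reference of that sum (sketch only, with real and imaginary parts stored separately):

// me = sum_ij (Ri*Rj + Ii*Ij) * cf[i][j] / denom[i] (row i of CF divided by denom[i])
double colorSumReference( const double* jampR, const double* jampI,
                          const double* cf, const double* denom, int ncolor )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me += ( jampR[i] * jampR[j] + jampI[i] * jampI[j] ) * cf[i * ncolor + j] / denom[i];
  return me;
}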
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary terms cancel because M is symmetric, so AMB = BMA).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..08f07c1187 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h @@ -0,0 +1,4120 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
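Aside: color_sum_blas above maps the same quadratic form onto two BLAS calls, a gemm producing Ztemp = CF_norm x Jamps and a strided-batched gemm of per-event dot products. A loop-level stand-in for the two steps, shown for one real component with an illustrative row-major layout (the code runs it separately for real and imaginary parts):

void colorSumTwoStepSketch( double* MEs, const double* jamp /* [ncolor][nevt] */,
                            const double* cfNorm /* [ncolor][ncolor] */,
                            double* ztemp /* [ncolor][nevt] */, int ncolor, int nevt )
{
  // Step 1 (gemm): ztemp = cfNorm * jamp
  for( int i = 0; i < ncolor; i++ )
    for( int e = 0; e < nevt; e++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += cfNorm[i * ncolor + j] * jamp[j * nevt + e];
      ztemp[i * nevt + e] = sum;
    }
  // Step 2 (batched dot products): MEs[e] += jamp[:,e] . ztemp[:,e]
  for( int e = 0; e < nevt; e++ )
    for( int i = 0; i < ncolor; i++ )
      MEs[e] += jamp[i * nevt + e] * ztemp[i * nevt + e];
}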
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,    // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,  // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel )       // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 123 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    // Amplitude(s) for diagram number 1
+    VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
+#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 *** 
+ // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 123 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 123 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 123 *** + // Wavefunction(s) for diagram number 18 + FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 123 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 123 *** + // Wavefunction(s) for diagram number 20 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 123 *** + // Wavefunction(s) for diagram number 21 + // (none) + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 123 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 123 *** + // Wavefunction(s) for diagram number 23 + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] ); + // Amplitude(s) for diagram number 23 + VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 123 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 123 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 123 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 123 *** + // Wavefunction(s) for diagram number 27 + // (none) + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 123 *** + // Wavefunction(s) for diagram number 28 + // (none) + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ 
(1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 123 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 123 *** + // Wavefunction(s) for diagram number 30 + // (none) + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A 
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 123 *** + // Wavefunction(s) for diagram number 31 + // (none) + // Amplitude(s) for diagram number 31 + VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 123 *** + // Wavefunction(s) for diagram number 32 + VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); + VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 123 *** + // Wavefunction(s) for diagram number 33 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 123 *** + // Wavefunction(s) for diagram number 34 + FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 34 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
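[Editor's note] The two lines guarded by MGONGPU_SUPPORTS_MULTICHANNEL in each diagram implement the single-diagram-enhancement (SDE) bookkeeping: |amp|^2 of the sampler-selected diagram accumulates into the numerators, |amp|^2 of every diagram into the denominators, and their ratio reweights the event. A standalone toy version in plain C++, independent of the plugin and with made-up amplitude values:

  #include <complex>
  #include <iostream>
  #include <vector>

  int main()
  {
    using cxtype = std::complex<double>;
    auto cxabs2 = []( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); };
    const std::vector<cxtype> amps = { { 0.3, -1.2 }, { 2.0, 0.1 }, { -0.7, 0.4 } }; // toy diagram amplitudes
    const unsigned int channelId = 2; // pretend the sampler chose diagram 2 (0 would disable SDE)
    double numerator = 0., denominator = 0.;
    for( unsigned int d = 1; d <= amps.size(); d++ )
    {
      if( channelId == d ) numerator += cxabs2( amps[d - 1] );   // selected diagram only
      if( channelId != 0 ) denominator += cxabs2( amps[d - 1] ); // all diagrams
    }
    std::cout << "SDE weight for channel 2: " << numerator / denominator << std::endl;
    return 0;
  }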
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 123 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 123 ***
+    // Wavefunction(s) for diagram number 49
+    VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 123 ***
+    // Wavefunction(s) for diagram number 50
+    VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 123 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
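[Editor's note] The J_ACCESS::kernelAccessIcol lines scatter each diagram amplitude into the color-ordered partial amplitudes ("jamps") with coefficients that are always +-1 or +-i, which is why only the four patterns "+= amp", "-= amp", "+= cxtype( 0, 1 ) * amp" and "-= cxtype( 0, 1 ) * amp" appear. A standalone illustration in plain C++ (ncolor = 24 is an inference from the jamp indices 0..23 used in this file, not a value stated in this diff):

  #include <complex>
  #include <iostream>
  #include <vector>

  int main()
  {
    using cxtype = std::complex<double>;
    const int ncolor = 24; // inferred: jamp indices up to 23 appear in this process
    std::vector<cxtype> jamp( ncolor, cxtype( 0., 0. ) );
    const cxtype amp( 0.5, -0.25 );     // toy amplitude for one diagram
    jamp[9] += cxtype( 0., 1. ) * amp;  // same +i coefficient pattern as diagram 35 above
    jamp[11] -= cxtype( 0., 1. ) * amp; // same -i coefficient pattern as diagram 35 above
    std::cout << "jamp[9] = " << jamp[9] << ", jamp[11] = " << jamp[11] << std::endl;
    return 0;
  }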
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 123 ***
+    // Wavefunction(s) for diagram number 52
+    FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 123 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 123 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 123 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 123 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 123 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 123 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 123 ***
+    // Wavefunction(s) for diagram number 59
+    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 59
+    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
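[Editor's note] Because every diagramNN kernel has the identical signature, a caller can drive them from a table instead of 123 hand-written calls. The plugin's actual driver is not shown in this diff; the following is a hedged sketch for the C++ build, assuming __global__ expands to nothing there and that one call processes one event page:

  // Hedged sketch (assumption, not from this diff): uniform dispatch over diagram kernels
  typedef void ( *diagram_t )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
  static const diagram_t s_diagrams[] = { diagram57, diagram58, diagram59 }; // ... all 123 in practice
  inline void runDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators )
  {
    for( diagram_t d : s_diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators );
  }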
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 123 ***
+    // Wavefunction(s) for diagram number 60
+    // (none)
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 123 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 123 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 123 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
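[Editor's note] On the GPU build (MGONGPUCPP_GPUIMPL) the same functions are __global__ kernels reading the dependent couplings for all events. A hedged launch sketch using plain CUDA syntax; nblocks, nthreads and the dev* buffer names are assumptions of this note, not identifiers from this diff:

  // Hedged sketch (assumption, not from this diff): sequential kernel launches for one helicity
  diagram60<<<nblocks, nthreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  diagram61<<<nblocks, nthreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  assert( cudaPeekAtLastError() == cudaSuccess ); // cudaPeekAtLastError is standard CUDA; this check style is an assumption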
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 123 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 123 *** + // Wavefunction(s) for diagram number 65 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 123 *** + // Wavefunction(s) for diagram number 66 + VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] ); + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 123 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 123 *** + // Wavefunction(s) for diagram number 68 + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 68 + FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 123 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 123 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 123 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 123 *** + // Wavefunction(s) for diagram number 72 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 123 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 123 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 123 ***
+    // Wavefunction(s) for diagram number 74
+    // (none)
+    // Amplitude(s) for diagram number 74
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
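diagram74 is the first four-gluon-vertex diagram in this hunk (diagrams 93, 100 and 107 follow the same pattern): the vertex is split into three Lorentz structures (VVVV1_0, VVVV3_0, VVVV4_0) that overwrite the same scratch amplitude in turn, each feeding its own subset of color flows, and no numerators/denominators block is generated for them, presumably because a four-point vertex does not map onto a single propagator channel of the SDE multichannel scheme. A toy scalar model of the overwrite-then-accumulate pattern (all names hypothetical):

    #include <complex>
    typedef std::complex<double> cxtype;
    int main()
    {
      cxtype jamp[24] = {};            // one accumulator per color flow
      cxtype amp;                      // single scratch amplitude, overwritten per Lorentz structure
      amp = cxtype( 0.1, 0.2 );        // stands in for VVVV1_0( ..., &amp_fp[0] )
      jamp[4] += cxtype( 0, 1 ) * amp; // fold structure 1 into its color flows
      jamp[7] -= cxtype( 0, 1 ) * amp;
      amp = cxtype( 0.3, -0.1 );       // stands in for VVVV3_0( ... ): scratch reused, earlier sums unaffected
      jamp[4] += cxtype( 0, 1 ) * amp;
      return 0;
    }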
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 123 ***
+    // Wavefunction(s) for diagram number 75
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 75
+    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 123 ***
+    // Wavefunction(s) for diagram number 76
+    // (none)
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
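In every multichannel block, cxabs2 adds the squared modulus of the diagram amplitude to denominators_sv whenever a channel is active, and to numerators_sv only when this diagram is the selected channel; downstream code (not in this hunk) can then reweight by the numerator/denominator ratio for single-diagram enhancement. A self-contained sketch of cxabs2 and of that assumed reweighting:

    #include <complex>
    typedef double fptype;
    // |z|^2 without the square root that std::abs would take
    inline fptype cxabs2( const std::complex<fptype>& z )
    {
      return z.real() * z.real() + z.imag() * z.imag();
    }
    // assumed downstream use (illustration only, not code from this patch):
    //   channelWeight = matrixElement * numerator / denominator;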
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 123 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 123 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
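Each kernel only adds (plus or minus) amp or i*amp into a few of the ncolor color amplitudes; the physical |M|^2 emerges only after all 123 diagrams have run, when the color amplitudes are contracted with the color matrix. A sketch of that final contraction with hypothetical names (the generated code does this with its own cf matrix and memory-access classes):

    // jamp[icol] : complex color amplitudes filled by the diagram kernels above
    // cf[i][j], denom[i] : color matrix and row denominators from the generated code
    fptype deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      cxtype ztemp = cxtype( 0, 0 );
      for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
      deltaME += ( ztemp.real() * jamp[i].real() + ztemp.imag() * jamp[i].imag() ) / denom[i];
    }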
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 123 ***
+    // Wavefunction(s) for diagram number 81
+    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 123 ***
+    // Wavefunction(s) for diagram number 82
+    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 123 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 123 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 123 ***
+    // Wavefunction(s) for diagram number 85
+    // (none)
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
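diagrams 81-85 show the other half of the ALOHA calling convention: besides the *_0 routines that only produce an amplitude, the *_1/*_2 routines (FFV1_1, FFV1_2) write the off-shell wavefunction of leg 1 or leg 2, with mass and width taken from cIPD, into a w_fp slot; the slots are a reused scratch pool (w_fp[23], written by FFV1_1 in diagram81 and read by diagrams 83 and 85, is overwritten by VVV1P0_1 in diagram86). A summary of the naming scheme as used in this hunk (convention summary, not declarations from the patch):

    // FFV1_0( f1, f2, v, coup, sign, amp )         -> all legs on-shell, amplitude into amp
    // FFV1_1( f, v, coup, sign, m, w, fout )       -> off-shell fermion for leg 1 into fout
    // FFV1_2( f, v, coup, sign, m, w, fout )       -> off-shell fermion for leg 2 into fout
    // VVV1P0_1( v1, v2, coup, sign, 0., 0., vout ) -> off-shell vector with zero mass/width into vout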
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 123 ***
+    // Wavefunction(s) for diagram number 86
+    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 123 ***
+    // Wavefunction(s) for diagram number 87
+    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 88 OF 123 ***
+    // Wavefunction(s) for diagram number 88
+    FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 88
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 89 OF 123 ***
+    // Wavefunction(s) for diagram number 89
+    // (none)
+    // Amplitude(s) for diagram number 89
+    FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 123 ***
+    // Wavefunction(s) for diagram number 90
+    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 91 OF 123 ***
+    // Wavefunction(s) for diagram number 91
+    // (none)
+    // Amplitude(s) for diagram number 91
+    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 92 OF 123 ***
+    // Wavefunction(s) for diagram number 92
+    // (none)
+    // Amplitude(s) for diagram number 92
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
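Because every diagramN kernel shares this exact signature, a caller can drive all 123 of them from a single table rather than 123 hand-written call sites. A hypothetical sketch for the C++ build, where __global__ expands to nothing and these are plain functions; the calls must stay in diagram order, since later kernels read w_fp slots written by earlier ones (e.g. diagram94 writes w_fp[22], which diagrams 96 and 98 consume):

    typedef void ( *Diagram )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
    static const Diagram diagrams[] = { diagram70, diagram71, diagram72 /* ... up to diagram123 ... */ };
    for( const Diagram& d : diagrams )
      d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // order matters: wfs carries data between kernels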
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 93 OF 123 ***
+    // Wavefunction(s) for diagram number 93
+    // (none)
+    // Amplitude(s) for diagram number 93
+    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 123 ***
+    // Wavefunction(s) for diagram number 94
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 94
+    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 123 ***
+    // Wavefunction(s) for diagram number 95
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 95
+    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
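J_ACCESS::kernelAccessIcol addresses one color amplitude inside the packed jamps buffer (ncolor*2*nevtORneppV reals, per the parameter comment); the real access class hides the SIMD/AOSOA event layout. For a single scalar event it would plausibly reduce to plain complex indexing; illustration only, not the real J_ACCESS:

    // hypothetical scalar reduction of J_ACCESS::kernelAccessIcol (one event, [icol][re,im] packing assumed)
    inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
    {
      return reinterpret_cast<cxtype*>( jamps )[icol];
    }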
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 123 ***
+    // Wavefunction(s) for diagram number 96
+    // (none)
+    // Amplitude(s) for diagram number 96
+    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 123 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 123 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 123 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
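The coupling slots are used consistently throughout the hunk: every VVV1* call takes COUPs[0], every FFV1* call takes COUPs[1], and the VVVV1/3/4 calls take COUPs[2], i.e. one slot per interaction type (triple-gluon, quark-gluon, four-gluon). A hypothetical enum just to make the observed mapping explicit:

    enum CouplingSlot { kVVV = 0, kFFV = 1, kVVVV = 2 }; // names invented here; indices as observed in the calls above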
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 123 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 123 ***
+    // Wavefunction(s) for diagram number 101
+    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 101
+    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 123 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 103 OF 123 *** + // Wavefunction(s) for diagram number 103 + // (none) + // Amplitude(s) for diagram number 103 + FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 104 OF 123 *** + // Wavefunction(s) for diagram number 104 + // (none) + // Amplitude(s) for diagram number 104 + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 105 OF 123 *** + // Wavefunction(s) for diagram number 105 + // (none) + // Amplitude(s) for diagram number 105 + FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 106 OF 123 *** + // Wavefunction(s) for diagram number 106 + // (none) + // Amplitude(s) for diagram number 106 + FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate 
code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 107 OF 123 *** + // Wavefunction(s) for diagram number 107 + // (none) + // Amplitude(s) for diagram number 107 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 108 OF 123 *** + // Wavefunction(s) for diagram number 108 + // (none) + // Amplitude(s) for diagram number 108 + VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 108 ) numerators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 109 OF 123 *** + // Wavefunction(s) for diagram number 109 + // (none) + // Amplitude(s) for diagram number 109 + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 110 OF 123 *** + // Wavefunction(s) for diagram number 110 + // (none) + // Amplitude(s) for diagram number 110 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + 
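[Editorial note, not part of the patch] The diagramXXX kernels above all share one shape: a uniform signature, a textual #include "diagram_boilerplate.h" for the per-event setup, one or more HELAS calls filling amp_sv, and optional single-diagram-enhancement (SDE) bookkeeping. The SDE logic visible in these kernels can be summarized by the following minimal sketch with plain std types; the free function sdeAccumulate is hypothetical, and the production code instead operates on SIMD vector types through the numerators_sv/denominators_sv views:

#include <complex>
// Per event and per diagram: add this diagram's |amp|^2 to the numerator only
// if it is the chosen channel, and to the denominator for every diagram; the
// ratio later reweights the matrix element (channelId == 0 disables SDE).
inline void sdeAccumulate( unsigned int thisDiagram, unsigned int channelId,
                           const std::complex<double>& amp,
                           double& numerator, double& denominator )
{
  const double abs2 = std::norm( amp ); // |amp|^2, the analogue of cxabs2( amp_sv[0] )
  if( channelId == thisDiagram ) numerator += abs2;
  if( channelId != 0 ) denominator += abs2;
}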
//-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 116 OF 123 *** + // Wavefunction(s) for diagram number 116 + // (none) + // Amplitude(s) for diagram number 116 + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 117 OF 123 *** + // Wavefunction(s) for diagram number 117 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 117 + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + 
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 118 OF 123 *** + // Wavefunction(s) for diagram number 118 + // (none) + // Amplitude(s) for diagram number 118 + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 119 OF 123 *** + // Wavefunction(s) for diagram number 119 + // (none) + // Amplitude(s) for diagram number 119 + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 120 OF 123 *** + // Wavefunction(s) for diagram number 120 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 120 + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 121 OF 123 *** + // Wavefunction(s) for diagram number 121 + // (none) + // Amplitude(s) for diagram number 121 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 122 OF 123 *** + // Wavefunction(s) for diagram number 122 + // (none) + // Amplitude(s) for diagram number 122 + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 123 OF 123 *** + // Wavefunction(s) for diagram number 123 + // (none) + // Amplitude(s) for diagram number 123 + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
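[Editorial note, not part of the patch] For orientation before the matrix1.f colour-matrix hunk that follows: each kernel above accumulates (plus or minus 1, or plus or minus i) times amp into a few colour-ordered amplitudes jamp[icol], and the squared matrix element is then the quadratic form jamp-dagger * CF * jamp over the NCOLOR=24 colour flows. A self-contained sketch with plain std types (hypothetical names; the production code instead uses the J_ACCESS views and the new color_sum.h machinery):

#include <complex>
// Dense colour sum: me2 = sum_{i,j} conj(jamp[i]) * cf[i*ncolor+j] * jamp[j],
// where cf is the real symmetric colour-factor matrix.
double colorSumDense( const std::complex<double>* jamp, const double* cf, int ncolor )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i * ncolor + j] * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // only the real part survives
  }
  return me2;
}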
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 39ecff768a..48a83737ca 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +442,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ 
,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA 
(CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ 
-1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ 
+00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1222,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1236,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 70d0f7cb8e..da1b425ff0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
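// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] The Fortran hunks above replace the
// dense NCOLOR x NCOLOR color matrix of REAL*8 values by a packed integer upper triangle: each row I
// keeps only the columns J >= I (hence the visible DATA blocks shrink from 13 entries down to 1),
// off-diagonal entries are pre-doubled (valid because the color matrix is real and symmetric), and a
// single common denominator is divided out once at the end, via the new "MATRIX1 = MATRIX1/DENOM".
// A minimal self-contained C++ equivalent of the new CF_INDEX loop, with hypothetical names
// (cfPacked, denom, jamp are assumptions, not identifiers from the patch):
#include <complex>
#include <cstddef>
#include <vector>
inline double colorSumPacked( const std::vector<int>& cfPacked,              // rows i, columns j >= i; off-diagonals times 2
                              const std::vector<std::complex<double>>& jamp, // one partial amplitude per color flow
                              double denom )                                 // common denominator of the integer entries
{
  double me2 = 0;
  int cfIndex = 0; // running index into the packed triangle, like CF_INDEX in the Fortran hunk above
  for( std::size_t i = 0; i < jamp.size(); i++ )
  {
    std::complex<double> ztemp = 0;
    for( std::size_t j = i; j < jamp.size(); j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // imaginary parts cancel for a real symmetric matrix
  }
  return me2 / denom; // divide by the common denominator once, at the end
}
// For ncolor=24 color flows the packed array holds 24*25/2 = 300 entries instead of 576, which is
// why the last DATA block above is "(CF(I),I=300,300)".
// ------------------------------------------------------------------------------------------------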
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel 
single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
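// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] With the diagram-per-kernel split
// introduced by this patch, CUDA wavefunctions and jamps can no longer live in per-thread local
// arrays: they move to global-memory buffers (allWfs, allJamps) that persist across the kernel
// launches of a single helicity. The accessors above (see DeviceAccessJamp2) lay these buffers out
// structure-of-arrays, buffer[icol * nevt + ievt], so that for a fixed component icol consecutive
// events sit at consecutive addresses and warp accesses coalesce. A minimal host-side model of the
// same indexing (soaAccess is a hypothetical name):
#include <cassert>
#include <vector>
inline double& soaAccess( std::vector<double>& buffer, int icol, int ievt, int nevt )
{
  assert( ievt >= 0 && ievt < nevt );
  return buffer[icol * nevt + ievt]; // component-major layout: the event index is the fastest-running one
}
// Usage: a buffer holding ncolor components for nevt events is allocated as ncolor * nevt values,
// and component 3 of event 7 is soaAccess( buffer, 3, 7, nevt ).
// ------------------------------------------------------------------------------------------------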
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6.
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxuux()?)
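// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] The color matrix and the removed
// color-sum code below exploit the fact that the matrix M is real (#475): writing the vector of
// jamps as A + iB, the quadratic form (A - iB)^T M (A + iB) expands to
// A^T M A + B^T M B + i( A^T M B - B^T M A ), and the imaginary term vanishes because M is also
// symmetric, so |M|^2 can be accumulated from the real and imaginary parts separately, with no
// complex arithmetic. A minimal sketch of the dense (CUDA-style) variant, with hypothetical names:
#include <cstddef>
#include <vector>
inline double colorSumRealImag( const std::vector<double>& jampR,           // real parts of the jamps (A)
                                const std::vector<double>& jampI,           // imaginary parts of the jamps (B)
                                const std::vector<std::vector<double>>& cf, // the real symmetric color matrix M
                                const std::vector<double>& denom )          // per-row denominators
{
  double me2 = 0;
  for( std::size_t icol = 0; icol < jampR.size(); icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( std::size_t jcol = 0; jcol < jampR.size(); jcol++ )
    {
      ztempR += cf[icol][jcol] * jampR[jcol]; // M acting on the real parts
      ztempI += cf[icol][jcol] * jampI[jcol]; // M acting on the imaginary parts
    }
    me2 += ( ztempR * jampR[icol] + ztempI * jampI[icol] ) / denom[icol]; // A.M.A + B.M.B
  }
  return me2;
}
// The C++ SIMD path below goes one step further (TriangularNormalizedColorMatrix): it folds the
// "/denom" and the factor 2 for off-diagonal terms into a constexpr matrix at compile time and
// loops only over the upper triangle.
// ------------------------------------------------------------------------------------------------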
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +785,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
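// The helicity choice in the loop above and the color choice in select_col share the same
// inverse-CDF sampling idea: accumulate unnormalized non-negative weights into a running sum
// and pick the first index whose cumulative fraction exceeds a uniform random number in [0,1).
// A minimal standalone illustration in plain C++ (hypothetical names, not part of this diff):
#include <cassert>
#include <vector>
static int selectIndexFromWeights( const std::vector<double>& weights, double rnd ) // rnd in [0,1)
{
  double total = 0;
  std::vector<double> runsum( weights.size() );
  for( size_t i = 0; i < weights.size(); i++ ) { total += weights[i]; runsum[i] = total; }
  assert( total > 0 ); // mirrors the sanity checks on channelId and MEs in the kernels
  for( size_t i = 0; i < weights.size(); i++ )
    if( rnd < runsum[i] / total ) return (int)i; // same test as 'allrndhel[ievt] < runningME / totalME'
  return (int)weights.size() - 1; // guard against rnd ~ 1 under rounding
}
// E.g. selectIndexFromWeights( jamp2weights, allrndcol[ievt] ) would return an index in
// [0, ncolor-1], which the kernel then stores as icolC + 1 in the Fortran convention [1, ncolor].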
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb 
individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1475,20 +1145,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY 
CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, 
allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 84a8066974..35a0f978a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 49cac7230f..ef6c2d98a2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index 6e1c3f774f..44b8eb0a9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -305,6 +305,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -388,12 +392,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -517,6 +521,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
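// What this new file computes: for each event and helicity, the ME increment is the color
// quadratic form deltaME = sum_{i,j} conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j],
// which is real because the color matrix is real and symmetric. A minimal single-event,
// double-precision sketch of that formula (hypothetical standalone names; the code below instead
// vectorizes over events, exploits symmetry on CPU, and optionally offloads to cuBLAS/hipBLAS):
#include <complex>
static double colorSumOneEvent( const std::complex<double>* jamp, // input: jamp[ncol]
                                const double cf[12][12],          // input: color matrix (ncol=12 here)
                                const double* denom,              // input: color denominators[ncol]
                                int ncol )
{
  double deltaME = 0;
  for( int i = 0; i < ncol; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncol; j++ ) ztemp += cf[i][j] / denom[i] * jamp[j];
    deltaME += ( std::conj( jamp[i] ) * ztemp ).real(); // imaginary parts cancel by symmetry
  }
  return deltaME;
}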
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = 
DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h
new file mode 100644
index 0000000000..321eb5303f
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h
@@ -0,0 +1,1132 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
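+
+// Each diagramXXX kernel below evaluates one Feynman diagram for the current helicity
+// and accumulates its contribution into the color amplitudes jamps[ncolor*2*nevtORneppV].
+// A minimal sketch of the intended call sequence (hypothetical driver code, for
+// illustration only - the actual loop lives in the generated CPPProcess.cc):
+//   for( int ihel = 0; ihel < ncomb; ihel++ ) // loop over helicity combinations
+//   {
+//     // diagram1 also computes the external wavefunctions, hence the extra momenta/ihel arguments
+//     diagram1( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//     diagram2( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses wfs from diagram1
+//     // ... diagram3 through diagram36 ...
+//     // finally reduce jamps to |M|^2 via the color sum (see color_sum.h)
+//   }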
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 36 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+  }
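+
+  // Note on the jamp updates above (illustration only): J_ACCESS::kernelAccessIcol( jamps, icol )
+  // returns a reference to the color amplitude for color index icol and the current event within
+  // the SOA buffer jamps[ncolor*2*nevtORneppV]; with the "new1" striding used for cuBLAS this
+  // corresponds to the manual indexing
+  //   jamps[0 * ncolor * nevt + icol * nevt + ievt] // real part
+  //   jamps[1 * ncolor * nevt + icol * nevt + ievt] // imaginary part
+  // The rational coefficients (e.g. -1/2, +1/6) multiplying amp_sv[0] are the projections of
+  // this diagram's amplitude onto the ncolor=12 color basis vectors (color-flow decomposition).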
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 36 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 36 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2.
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 9fb8f4d180..ffaa3cde67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -760,10 +694,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -772,6 +708,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index ac4bf091b7..421e3e13fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
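      // [Illustrative note, not part of the generated code] The cast below views
      // the local w_sv array of SIMD complex values as the flat fptype buffer
      // that the C++ diagram kernels expect for their "wfs" argument; this
      // relies on cxtype_sv being layout-compatible with one real plus one
      // imaginary fptype_sv, an assumption that could be made explicit with
      // checks such as:
      //
      //   static_assert( sizeof( cxtype_sv ) == 2 * sizeof( fptype_sv ),
      //                  "cxtype_sv must pack exactly one real and one imaginary fptype_sv" );
      //   static_assert( sizeof( w_sv ) == nwf * nw6 * 2 * sizeof( fptype_sv ),
      //                  "w_sv must be viewable as a flat wavefunction buffer" );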
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gu_ttxgu()?)
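    //--------------------------------------------------------------------------
    // [Illustrative sketch, not part of the generated code] Both the packed
    // INTEGER CF(NCOLOR*(NCOLOR+1)/2) rewrite in matrix1.f above and the C++
    // color algebra removed below evaluate the same quadratic form
    // |M|^2 = sum_{i,j} conj(jamp[i]) * cf[i][j] * jamp[j] / denom,
    // exploiting that cf is real and symmetric: only the upper triangle is
    // kept, with off-diagonal entries counted twice. The Fortran packed array
    // stores CF*DENOM with off-diagonals pre-doubled (e.g. 5.333... = 16/3
    // becomes 32, with DENOM=3 divided out once at the end), while the removed
    // TriangularNormalizedColorMatrix below baked the factor 2 and the 1/denom
    // into a constexpr table instead. A scalar reference version of the packed
    // sum (hypothetical helper, for clarity only):
    //
    //   #include <complex>
    //   double colorSum( int ncolor, const int* cfPacked, int denom,
    //                    const std::complex<double>* jamp )
    //   {
    //     double me = 0.;
    //     int idx = 0;
    //     for( int i = 0; i < ncolor; i++ )
    //     {
    //       std::complex<double> ztemp = 0.;
    //       for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j]; // upper triangle only
    //       me += ( ztemp * std::conj( jamp[i] ) ).real(); // imaginary parts cancel in the full sum
    //     }
    //     return me / denom;
    //   }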
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, - { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, - { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, - { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, - { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, - { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, - { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, - { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, - { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, - { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, - { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, - { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //--------------------------------------------------------------------------
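
For context on the dependent/independent coupling split used by G2COUP above (and by the allCOUPs/COUPs setup earlier in this patch): dependent couplings are recomputed event-by-event from each event's running alpha_s, while independent couplings are a single set of constants shared by all events. A minimal sketch of that lookup, assuming a hypothetical flat [ncoup][nevt] layout rather than the plugin's CD_ACCESS/CI_ACCESS memory accessors:

    // Sketch only: depCoups[idcoup*nevt+ievt] vary per event, indepCoups[iicoup] do not.
    const double* eventCoupling( const double* depCoups, const double* indepCoups,
                                 int ndcoup, int nevt, int icoup, int ievt )
    {
      if( icoup < ndcoup ) return &depCoups[icoup * nevt + ievt]; // dependent: event-by-event
      return &indepCoups[icoup - ndcoup];                         // independent: fixed for all events
    }
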
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +785,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //--------------------------------------------------------------------------
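
Note on the atomicAdd in update_jamp2s: with one good helicity per CUDA/HIP stream, several instances of this kernel may be in flight at once, all adding their helicity's |jamp|^2 into the same colAllJamp2s slot, so the per-color accumulation must be atomic. A standalone sketch of the same pattern, assuming plain doubles and a flat [ncolor][nevt] layout instead of the plugin's DeviceAccessJamp/DeviceAccessJamp2 accessors:

    // Sketch only: launched once per helicity, possibly on concurrent streams.
    __global__ void accumulateJamp2Sketch( const double* jamp2OneHel, // |jamp|^2 for one helicity [ncolor*nevt]
                                           double* jamp2Sum,          // running sum over helicities [ncolor*nevt]
                                           int ncolor, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      for( int icol = 0; icol < ncolor; icol++ )
        atomicAdd( &jamp2Sum[icol * nevt + ievt], jamp2OneHel[icol * nevt + ievt] ); // streams race on this slot
    }
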
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
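
Both select_col above and the helicity choice in add_and_select_hel follow the same selection pattern: build a running cumulative sum of non-negative weights (targetamp, or the running ME sums) and return the first bin whose cumulative fraction exceeds a uniform random number in [0,1). A scalar sketch of that pattern with a hypothetical helper name and plain doubles:

    // Sketch only: pick index i with probability weights[i] / sum(weights).
    int selectIndexSketch( const double* weights, int n, double rnd )
    {
      double total = 0;
      for( int i = 0; i < n; i++ ) total += weights[i];
      double cumul = 0;
      for( int i = 0; i < n; i++ )
      {
        cumul += weights[i];
        if( rnd < cumul / total ) return i; // first bin whose cumulative fraction exceeds rnd
      }
      return n - 1; // guard against rounding when rnd is very close to 1
    }
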
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1473,22 +1143,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343)
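
For this P2_gu_ttxgu subprocess the value 96 is the average over initial-state spins and colors: the gluon has 2 helicities and 8 colors, the up quark 2 helicities and 3 colors, so

    helcolDenominators[0] = ( 2 * 2 ) spins * ( 8 * 3 ) colors = 4 * 24 = 96
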
-#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // (in multichannel mode, also compute the per-helicity numerators and denominators; squared jamp2s are handled in step 1b) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, 
allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif
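
Steps (1a) and (1b) rely on one CUDA/HIP stream per good helicity, so that the independent per-helicity kernels can overlap on the device; cross-helicity steps only happen after a single synchronization point (the gpuDeviceSynchronize in step 3 below). A self-contained CUDA sketch of that launch pattern, with a toy kernel and buffer names that are not part of the plugin:

    #include <cuda_runtime.h>

    __global__ void oneHelicitySketch( float* out, int ihel )
    {
      out[blockIdx.x * blockDim.x + threadIdx.x] += ihel; // stand-in for the per-helicity work
    }

    int main()
    {
      const int nhel = 4, nthr = 32;
      cudaStream_t streams[nhel];
      float* buf[nhel];
      for( int i = 0; i < nhel; i++ )
      {
        cudaStreamCreate( &streams[i] );
        cudaMalloc( &buf[i], nthr * sizeof( float ) );
        cudaMemsetAsync( buf[i], 0, nthr * sizeof( float ), streams[i] );
        oneHelicitySketch<<<1, nthr, 0, streams[i]>>>( buf[i], i ); // kernels on different streams may overlap
      }
      cudaDeviceSynchronize(); // join all helicity streams before any cross-helicity step
      for( int i = 0; i < nhel; i++ )
      {
        cudaFree( buf[i] );
        cudaStreamDestroy( streams[i] );
      }
      return 0;
    }
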
+ // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index f75309f403..1a5a996480 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
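
The new ncolor and nwf constants also fix the sizes of the per-helicity scratch buffers declared below (jamp[ncolor*2*nevt] and wf[nwf*nw6*2*nevt]). Per event this gives the following counts, assuming nw6 = 6 wavefunction components and a factor nx2 = 2 for real and imaginary parts (both taken from the buffer comments rather than from this header):

    // Sketch only: per-event scratch sizes implied by the constants above.
    constexpr int ncolor = 12, nwf = 15, nw6 = 6, nx2 = 2;
    constexpr int jampValuesPerEvent = ncolor * nx2;  // 12 complex jamps -> 24 fptype values
    constexpr int wfValuesPerEvent = nwf * nw6 * nx2; // 15 wavefunctions x 6 components -> 180 fptype values
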
// Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 47e378e255..eb269f804d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 756e98881c..3d6ffe6ba1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc new file mode 100644 index 0000000000..088e843da8 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, + { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, + { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, + { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, + { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, + { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, + { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, + { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, + { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, + { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, + { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, + { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denominator) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //--------------------------------------------------------------------------
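
color_sum_cpu below evaluates |M|^2 += sum_ij Re( conj(jamp_i) * jamp_j ) * colorMatrix[i][j] / colorDenom[i]. Because the color matrix is real and symmetric (and all denominators are equal for this process), the quadratic form splits into two real quadratic forms (real and imaginary parts), and only the upper triangle needs visiting, with off-diagonal entries counted twice; that is exactly what the TriangularNormalizedColorMatrix precomputes. A scalar sketch of the same algebra with plain std::complex doubles and a flat matrix, not the plugin's SIMD types:

    #include <complex>

    // Sketch only: J^dagger (C/d) J for real symmetric C with equal denominators d[i].
    double colorSumSketch( const std::complex<double>* jamp, const double* C, const double* d, int ncolor )
    {
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        double ztR = C[i * ncolor + i] / d[i] * jamp[i].real();
        double ztI = C[i * ncolor + i] / d[i] * jamp[i].imag();
        for( int j = i + 1; j < ncolor; j++ )
        {
          ztR += 2 * C[i * ncolor + j] / d[i] * jamp[j].real(); // symmetry: fold (i,j) and (j,i) together
          ztI += 2 * C[i * ncolor + j] / d[i] * jamp[j].imag();
        }
        me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // imaginary cross terms cancel for real C
      }
      return me2;
    }
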
colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps 
) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h new file mode 100644 index 0000000000..8e0fc00307 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
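  //--------------------------------------------------------------------------
  // [Editorial aside, not part of the generated diff] A minimal, single-event
  // sketch of the color sum implemented in color_sum.cc above, using plain
  // std::complex<double> in place of the plugin's fptype_sv/fptype2_sv SIMD
  // types (the function name and signature below are hypothetical). It shows
  // the two properties that color_sum_cpu exploits: the color matrix M is
  // real, so the quadratic form (A-iB)(M)(A+iB) reduces to AMA + BMB, and the
  // normalized matrix is symmetric, so only the upper triangle is visited,
  // with off-diagonal entries folded in with a factor 2 and each row
  // pre-divided by its denominator.
  #include <complex>
  double colorSumOneEvent( const std::complex<double>* jamp, // input: ncolor partial amplitudes for one event and helicity
                           const double* colorMatrix,        // input: ncolor*ncolor symmetric color matrix (row-major), as above
                           const double* colorDenom,         // input: ncolor per-row denominators, as above
                           const int ncolor )                // input: number of leading colors (12 in this subprocess)
  {
    double me2 = 0; // |M|^2 for this helicity (color_sum ADDS this to the running sum over helicities)
    for( int icol = 0; icol < ncolor; icol++ )
    {
      // ztemp[icol] = sum over jcol>=icol of Mnorm[icol][jcol] * jamp[jcol],
      // folding the lower triangle into the upper one (factor 2 off-diagonal)
      double ztempR = colorMatrix[icol * ncolor + icol] / colorDenom[icol] * jamp[icol].real();
      double ztempI = colorMatrix[icol * ncolor + icol] / colorDenom[icol] * jamp[icol].imag();
      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
      {
        ztempR += 2 * colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol].real();
        ztempI += 2 * colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol].imag();
      }
      me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // Re( conj(jamp[icol]) * ztemp[icol] )
    }
    return me2;
  }
  // The BLAS path in color_sum_blas computes the same quantity for all events
  // at once: Step 1 is one gemm per component, Ztemp = Mnorm * Jamp (real and
  // imaginary parts separately, since Mnorm is real); Step 2 is one strided
  // batched gemm of 1x1 results per component, ME[ievt] = Jamp(:,ievt) dot
  // Ztemp(:,ievt), with beta=1 so that the real and imaginary contributions
  // accumulate into the same MEs buffer.
  //--------------------------------------------------------------------------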
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
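The rational weights (1/2, 1/6) and the factors of cxtype( 0, 1 ) applied in the jamps updates are the color-flow coefficients of the usual leading-color decomposition. In generic notation (a reminder only, not something spelled out in this diff; the coefficient set shown is just what appears in these kernels):

\mathcal{M} \;=\; \sum_{c=1}^{n_{\mathrm{color}}} \lambda_c\, J_c, \qquad J_c \;=\; \sum_{d=1}^{n_{\mathrm{diag}}} C_{cd}\, \mathrm{amp}_d, \qquad C_{cd} \in \Big\{ 0,\ \pm\tfrac{1}{2},\ \pm\tfrac{1}{6},\ \pm\tfrac{i}{2},\ \pm\tfrac{i}{6} \Big\}

|\mathcal{M}|^2 \;=\; \frac{1}{\mathrm{DENOM}} \sum_{c,c'} J_c^{*}\, \mathrm{CF}_{cc'}\, J_{c'}

where CF and DENOM are the symmetric color matrix and common denominator that also appear in the matrix1.f rewrite further below.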
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index 0079f40417..616ba2e46c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 1),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,0,32,-4,0,32,12,0,32,0,-4/ C 1 T(1,3,2) T(5,6,4) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,32,0,0,-4,12,32,32,0,-4,0/ C 1 T(1,3,4) T(5,6,2) - DATA (CF(I, 3),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,32,12,-4,0,12,-4,-12,-4/ C 1 T(1,5,3,2) T(6,4) - DATA (CF(I, 4),I= 1, 6) /5.333333333333333D+00 - $ 
,0.000000000000000D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 4),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,12,32,0,-4,-4,12,-4,-12/ C 1 T(1,5,3,4) T(6,2) - DATA (CF(I, 5),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ - DATA (CF(I, 5),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,0,32,-4,-12,-4,12/ C 1 T(1,5,6,2) T(3,4) - DATA (CF(I, 6),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,32,0,-12,-4,12,-4/ C 1 T(1,5,6,4) T(3,2) - DATA (CF(I, 7),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 58, 63) /48,32,-4,0,32,0/ C 1 T(1,6,2) T(5,3,4) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,0,32/ C 1 T(1,6,4) T(5,3,2) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 69, 72) /48,32,12,32/ C 1 T(3,2) T(5,1,6,4) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 73, 75) /48,32,12/ C 1 T(3,4) T(5,1,6,2) - DATA (CF(I, 11),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,1,3,2) T(6,4) - DATA (CF(I, 12),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,1,3,4) T(6,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -761,10 +695,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -773,6 +709,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index acf1b836af..afd9438ba0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
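The w_fp/amp_fp "proof of concept" removed above, and the wfs pointer added below, both rely on the same layout fact: a buffer of complex values can be reinterpreted as contiguous fptype storage. A stand-alone illustration, using std::complex as a stand-in (the plugin's SIMD cxtype_v instead stores a vector of real parts followed by a vector of imaginary parts, but the pointer reinterpretation logic is the same):

// Sketch only: why reinterpret_cast<fptype*> on a complex wavefunction buffer
// is legitimate - std::complex<T> is guaranteed to be layout-compatible with
// T[2] (real part first, imaginary part second).
#include <cassert>
#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
int main()
{
  cxtype w[6] = { { 1., 2. }, { 3., 4. } };      // one wavefunction of nw6 = 6 components
  fptype* w_fp = reinterpret_cast<fptype*>( w ); // the same storage viewed as 12 fptypes
  assert( w_fp[0] == 1. && w_fp[1] == 2. && w_fp[2] == 3. && w_fp[3] == 4. );
  return 0;
}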
+ fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[4] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gux_ttxgux()?) 
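A minimal standalone sketch of the 'color choice' bookkeeping above — one running sum of |jamp|^2 per leading-color flow, later sampled for the event-by-event color selection — assuming scalar std::complex amplitudes rather than the generated cxtype_sv SIMD types (names here are illustrative, not the plugin API):

#include <complex>
// Accumulate the squared modulus of each color-flow partial amplitude;
// std::norm(z) returns |z|^2 and plays the role of the plugin's cxabs2.
inline void accumulateJamp2( const std::complex<double>* jamp, // input: one partial amplitude per color flow
                             double* jamp2,                    // in/out: running sums of |jamp|^2 per color flow
                             int ncolor )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += std::norm( jamp[icol] ); // may underflow for tiny amplitudes (cf. #831)
}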
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, - { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, - { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, - { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, - { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, - { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, - { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // Sum of MEs over all good helicities + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
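The helicity selection in add_and_select_hel above and the color selection completed below share one sampling pattern: build a cumulative sum of non-negative weights (MEs per good helicity, or jamp2 per allowed color) and pick the first index whose normalized cumulative value exceeds a random number in [0,1). A host-side sketch of that pattern with illustrative names — the kernels apply it per event, one GPU thread per event:

#include <cassert>
// Return the first index whose cumulative weight bin contains rnd in [0,1).
inline int selectByCumulativeWeight( const double* weights, int n, double rnd )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += weights[i]; // e.g. the running ME sums, or targetamp[ncolor-1]
  assert( total > 0 ); // a zero total would mean no selectable helicity or color
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += weights[i];
    if( rnd < running / total ) return i; // same test as 'allrndhel[ievt] < running/total'
  }
  return n - 1; // numerical guard for rnd very close to 1
}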
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb 
individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1473,22 +1143,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY 
CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, 
allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 531d6bcd03..3324b8da0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
#else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index f13f023e7d..887e00cba5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index a59705bfaf..5de24d634f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc new file mode 100644 index 0000000000..0bd6c47075 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, + { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, + { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, + { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, + { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, + { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, + { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- +
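(All the color-sum implementations below evaluate the same quadratic form: for each event and helicity, ME += Re( jamp† M̂ jamp ), where M̂[icol][jcol] = colorMatrix[icol][jcol] / colorDenom[icol] is the normalized color matrix defined above. A self-contained scalar illustration on a 2x2 toy corner of the matrix; the production kernels are vectorized and split the sum into real and imaginary parts exactly as sketched here.)

#include <complex>
#include <cstdio>
int main()
{
  const int ncolor = 2; // toy size, using the top-left 2x2 corner of the 12x12 matrix above
  const double colorMatrix[2][2] = { { 48, 16 }, { 16, 48 } };
  const double colorDenom[2] = { 3, 3 };
  const std::complex<double> jamp[2] = { { 0.1, 0.2 }, { -0.3, 0.4 } }; // dummy color-ordered amplitudes
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of the normalized matrix times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      const double mhat = colorMatrix[icol][jcol] / colorDenom[icol]; // normalized color matrix
      ztempR += mhat * jamp[jcol].real();
      ztempI += mhat * jamp[jcol].imag();
    }
    me += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // real part of jamp† M̂ jamp
  }
  std::printf( "|M|^2 contribution from this helicity: %f\n", me );
  return 0;
}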
+#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is also symmetric). + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific
helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use a strided-batched gemm (cublasSgemmStridedBatched or its double-precision/hipBLAS analogues) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches"
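+ // Shape summary for the two steps above (descriptive note): Step 1 is a gemm Ztemp(ncolor x nevt) = NormColMat(ncolor x ncolor) * Jamps^T(ncolor x nevt), done once for the real and once for the imaginary components;
+ // Step 2 is a batched 1x1 gemm over nevt batches, ME(ievt) += Jamps(ievt,:) dot Ztemp(:,ievt), again once per component.
+ // Together the two steps accumulate ME += Re( jamp^dagger * NormColMat * jamp ) for every event in this helicity slice.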
+ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h new file mode 100644 index 0000000000..513029c15f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
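(Each diagramN kernel in this generated header scatters one diagram's amplitude into the color-ordered jamps as jamp[icol] += c * amp, where the coefficients c, typically ±1/2 or ±1/6 and possibly times i, encode the color flow of that diagram. A self-contained illustration using diagram 1's coefficients, copied verbatim from the kernel below; this is an illustration only, with a dummy amplitude value, whereas the generated kernels use the vectorized fptype/cxtype types instead of std::complex.)

#include <complex>
#include <cstdio>
int main()
{
  using cxtype = std::complex<double>;
  const cxtype I( 0, 1 );
  cxtype jamp[12] = {}; // the 12 color-ordered amplitudes for one event and helicity
  const cxtype amp( 0.3, -0.7 ); // dummy stand-in for diagram 1's amplitude amp_sv[0]
  // The four color flows fed by diagram 1 (same coefficients as in diagram1 below)
  jamp[4] -= 1. / 2. * I * amp;
  jamp[5] += 1. / 6. * I * amp;
  jamp[10] -= 1. / 6. * I * amp;
  jamp[11] += 1. / 2. * I * amp;
  std::printf( "jamp[4] = ( %f, %f )\n", jamp[4].real(), jamp[4].imag() );
  return 0;
}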
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. 
 + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + } + + //--------------------------------------------------------------------------
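The J_ACCESS::kernelAccessIcol calls above, like the DeviceAccessJamp2 helper elsewhere in this diff, address the jamps/jamp2 buffers with the colour index as the slow dimension and the event index as the fast one, so that consecutive GPU threads touch consecutive addresses (coalesced access). A self-contained host-side mock of that indexing, with purely illustrative sizes:

#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 12, nevt = 8;                // illustrative sizes only
  std::vector<double> jamp2( ncolor * nevt, 0. ); // flattened [icol][ievt] buffer
  for( int ievt = 0; ievt < nevt; ievt++ )        // ievt plays the role of blockDim.x * blockIdx.x + threadIdx.x
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol * nevt + ievt] += 1.;            // same address arithmetic as kernelAccessIcol
  printf( "jamp2[icol=0][ievt=0] = %f\n", jamp2[0] );
  return 0;
}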
 + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //--------------------------------------------------------------------------
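The per-diagram numerators_sv/denominators_sv updates above accumulate cxabs2 of the amplitude for the selected channel and for all channels respectively. The final step is not shown in this diff, but in the MadEvent single-diagram-enhancement (SDE) scheme such running sums are typically used to reweight the summed matrix element once all diagrams and helicities have been accumulated. A minimal sketch, assuming per-event scalar sums (the function name and signature are illustrative, not from this diff):

// Minimal sketch (assumed, not shown in this diff) of SDE reweighting:
// scale |M|^2 by the fraction contributed by the selected channel's diagram.
inline double applySDE( double me2, double numerator, double denominator, unsigned int channelId )
{
  if( channelId != 0 && denominator != 0. ) return me2 * numerator / denominator;
  return me2; // channelId == 0 disables SDE, as in the comments above
}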
 + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 7cd8b962cc..db9ee54bf0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,109 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,0,32,-4,0/ C 1 T(1,2,4) T(5,3,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,32,0,0,-4/ C 1 T(1,2,6) T(5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,0,0,32/ C 1 T(1,3,4) T(5,2,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,0,-4,32,0/ C 1 T(1,3,6) T(5,2,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,12,-4,-4,-12/ C 1 T(1,5,2,4) T(3,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,12,-12,-4/ C 1 T(1,5,2,6) T(3,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,-4,-12,12,-4/ C 1 T(1,5,3,4) T(2,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,-12,-4,-4,12/ C 1 T(1,5,3,6) T(2,4) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01, - $ -2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,4) T(5,1,3,6) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,6) T(5,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,-2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(3,4) T(5,1,2,6) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,-2.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,2.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(3,6) T(5,1,2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -758,10 +694,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -770,6 +708,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index d34888db6a..fcc3d8d8af 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,20 +103,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,57 +171,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -231,377 +284,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 4. 
- - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0];
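The jamps pointer above reinterprets an array of complex SIMD values as a flat fptype buffer, which relies on the complex type being laid out as adjacent (real, imag) scalars. A standalone sketch of that layout assumption, using std::complex (which guarantees it) in place of cxtype_sv and an invented array size:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 3; // invented size for illustration
  std::complex<double> jamp[ncolor] = { { 1, 2 }, { 3, 4 }, { 5, 6 } };
  // std::complex guarantees the (re, im) adjacent-scalar layout used here
  double* flat = reinterpret_cast<double*>( jamp ); // [re0, im0, re1, im1, ...]
  for( int i = 0; i < 2 * ncolor; i++ ) printf( "flat[%d]=%f\n", i, flat[i] );
  return 0;
}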
- - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uc_ttxuc()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
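The #475 rewrite referenced in the comments above can be checked numerically: with A = Re(jamp), B = Im(jamp) and a real symmetric color matrix M, the quadratic form reduces to AMA + BMB, and the symmetric triangular loop (diagonal once, off-diagonal doubled, as in cf2) gives the same result. A toy-sized standalone check with an invented 3x3 matrix:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int n = 3; // invented toy size
  const double cf[n][n] = { { 4, 1, 2 }, { 1, 5, 3 }, { 2, 3, 6 } }; // symmetric real toy "color matrix"
  const std::complex<double> jamp[n] = { { 1, 2 }, { 3, -1 }, { 0.5, 0.25 } };
  double me1 = 0, me2 = 0;
  // (a) full quadratic form on A=Re(jamp), B=Im(jamp): AMA + BMB
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      me1 += jamp[i].real() * cf[i][j] * jamp[j].real() + jamp[i].imag() * cf[i][j] * jamp[j].imag();
  // (b) triangular form exploiting symmetry: diagonal once, off-diagonal doubled
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] * jamp[i].real(), ztI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] * jamp[j].real(); // the "2*" folded in like cf2 above
      ztI += 2 * cf[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  printf( "full=%f triangular=%f\n", me1, me2 ); // identical up to rounding
  return 0;
}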
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
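The fpvmerge/fpvsplit0/fpvsplit1 calls above implement the mixed-precision trick of #537: two double SIMD pages are packed into one float vector of twice the width, the color algebra runs once in single precision, and the two halves are routed back to their pages. A scalar-loop analogue of that packing, with invented sizes (the real code uses compiler vector extensions):

#include <cstdio>
int main()
{
  constexpr int neppV = 4; // invented SIMD page width
  double page0[neppV] = { 1.5, 2.5, 3.5, 4.5 };
  double page1[neppV] = { 5.5, 6.5, 7.5, 8.5 };
  float merged[2 * neppV]; // "fpvmerge": one float vector covering both pages
  for( int i = 0; i < neppV; i++ )
  {
    merged[i] = (float)page0[i];
    merged[neppV + i] = (float)page1[i];
  }
  // ... the single-precision color algebra would run on 'merged' here ...
  double out0[neppV], out1[neppV]; // "fpvsplit0/1": route results back per page
  for( int i = 0; i < neppV; i++ )
  {
    out0[i] = merged[i];
    out1[i] = merged[neppV + i];
  }
  printf( "out0[0]=%f out1[0]=%f\n", out0[0], out1[0] );
  return 0;
}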
- ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -688,7 +511,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif }
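fpeEnable() is now compiled only under MGONGPUCPP_DEBUG. On Linux/glibc, SIGFPE trapping of this kind is typically implemented via feenableexcept, a GNU extension; the following is a sketch of that mechanism under this assumption, not the plugin's actual implementation:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fenv.h> // feenableexcept is a glibc extension (not portable)
#include <cstdio>
int main()
{
  // Raise SIGFPE on division by zero, invalid operations and overflow
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
  printf( "FPE traps enabled\n" );
  return 0;
}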
//-------------------------------------------------------------------------- @@ -723,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +596,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -867,26 +702,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +729,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( 
"select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1048,22 +1087,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,17 +1108,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1111,93 +1147,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1268,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,25 +1277,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1299,8 +1311,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1330,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1437,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 08510dfc85..c26e439a36 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -80,17 +81,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
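The new header constants make the temporary-buffer sizes quoted in the signatures below reproducible: allJamps holds ncolor*2 fptypes per event and allWfs holds nwf*nw6*2 fptypes per event, replicated per good helicity in the ghel super-buffers. A standalone sketch of the arithmetic, where nw6=6, nx2=2, nGoodHel and nevt are assumed values for illustration only:

#include <cstdio>
int main()
{
  constexpr int ncolor = 6, nwf = 9; // as in CPPProcess.h above
  constexpr int nw6 = 6, nx2 = 2;    // assumed: wavefunction components and (re,im) parts
  const int nGoodHel = 16, nevt = 1024; // assumed runtime values
  const size_t jampsPerHel = (size_t)ncolor * nx2 * nevt;  // one allJamps slice: jamp[ncolor*2*nevt]
  const size_t wfsPerHel = (size_t)nwf * nw6 * nx2 * nevt; // one allWfs slice: wf[nwf*nw6*2*nevt]
  printf( "ghelAllJamps fptypes: %zu\n", nGoodHel * jampsPerHel );
  printf( "ghelAllWfs   fptypes: %zu\n", nGoodHel * wfsPerHel );
  return 0;
}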
// Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -127,23 +128,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -157,34 +161,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index bb9d2c55fb..6aea556b29 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index b76b7c4456..d676a45fad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -555,6 +559,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + 
// Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
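A note for readers of this patch: the triangular trick in color_sum_cpu above can be checked numerically in a few lines of standalone C++. The sketch below is illustrative only (it uses plain std::complex<double> scalars instead of the plugin's fptype2_sv vector types; the ncolor=6 matrix and unit denominators are the ones quoted in this file). It verifies that the upper-triangle loop over a pre-doubled, pre-normalized real symmetric color matrix reproduces the full quadratic form conj(J)*C*J:

// Standalone check (not part of the patch): triangular vs naive color sum.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 6;
  constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 27, 9, 9, 3, 3, 9 }, { 9, 27, 3, 9, 9, 3 }, { 9, 3, 27, 9, 9, 3 },
    { 3, 9, 9, 27, 3, 9 }, { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { 0.7, 0.2 }, { -0.5, 0.9 }, { 1.2, 0.4 }, { -0.8, -0.6 }, { 0.1, 1.3 } };
  // Naive: ME = sum_ij conj(jamp[i]) * (cf[i][j]/denom[i]) * jamp[j] (real because cf is real symmetric)
  std::complex<double> naive = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      naive += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Triangular: diagonal as-is, off-diagonal pre-doubled, only real accumulators needed
  double tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    tri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( naive.imag() ) < 1e-12 );      // imaginary parts cancel pairwise (AMB = BMA)
  assert( std::abs( naive.real() - tri ) < 1e-9 ); // AMA + BMB equals the full quadratic form
  std::printf( "|M|^2 contribution = %f (both methods agree)\n", tri );
  return 0;
}

The cancellation of the imaginary parts is exactly why the production code only needs the two real accumulators ztempR_sv/ztempI_sv per color.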
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied!
+    // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
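An aside on the BLAS strategy: the color_sum_blas routine that follows replaces the per-thread color loop with two BLAS calls, a gemm that forms Ztemp = C * J for all events at once, then a strided-batched gemm that reduces each event's column to the dot product J . Ztemp. The standalone sketch below emulates the same two-step algebra with plain C++ loops (illustrative only; tiny hypothetical dimensions, no cuBLAS, one Re or Im component):

// Plain-loop emulation (not part of the patch) of the two-step BLAS color sum:
// Step 1: Ztemp[icol][ievt] = sum_jcol C[icol][jcol] * J[jcol][ievt]  (one gemm per Re/Im part)
// Step 2: ME[ievt] += sum_icol J[icol][ievt] * Ztemp[icol][ievt]      (batched 1x1 "dot" gemms)
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 3;                      // tiny hypothetical dimensions
  const double C[2][2] = { { 3, 1 }, { 1, 3 } };       // real symmetric "color matrix"
  const double J[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } }; // one jamp component per color and event
  std::vector<double> Ztemp( ncolor * nevt, 0 ), ME( nevt, 0 );
  for( int i = 0; i < ncolor; i++ ) // Step 1: gemm
    for( int e = 0; e < nevt; e++ )
      for( int j = 0; j < ncolor; j++ )
        Ztemp[i * nevt + e] += C[i][j] * J[j][e];
  for( int e = 0; e < nevt; e++ )   // Step 2: batched dot products
    for( int i = 0; i < ncolor; i++ )
      ME[e] += J[i][e] * Ztemp[i * nevt + e];
  for( int e = 0; e < nevt; e++ ) std::printf( "ME[%d] = %g\n", e, ME[e] );
  return 0;
}

In the real routine this is done once for the real parts and once for the imaginary parts, and the batched gemm uses beta=1 so each call accumulates into the running per-event |M|^2.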
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h
new file mode 100644
index 0000000000..9f93aa2532
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h
@@ -0,0 +1,247 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 7 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 7 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 7 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 7 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 7 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 7 ***
+    // Wavefunction(s) for diagram number 6
+    FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 7 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
index bfe665d186..67afdb3cae 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -408,7 +408,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(8)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
C     Needed for v4 models
@@ -451,39 +452,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01
-     $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D
-     $ +00,3.000000000000000D+00,9.000000000000000D+00/
+      DATA DENOM/1/
+      DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/
 C     1 T(3,1) T(5,2) T(6,4)
-      DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00
-     $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D
-     $ +00,9.000000000000000D+00,3.000000000000000D+00/
+      DATA (CF(I),I= 7, 11) /27,6,18,18,6/
 C     1 T(3,1) T(5,4) T(6,2)
-      DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00
-     $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D
-     $ +00,9.000000000000000D+00,3.000000000000000D+00/
+      DATA (CF(I),I= 12, 15) /27,18,18,6/
 C     1 T(3,2) T(5,1) T(6,4)
-      DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00
-     $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D
-     $ +01,3.000000000000000D+00,9.000000000000000D+00/
+      DATA (CF(I),I= 16, 18) /27,6,18/
 C     1 T(3,2) T(5,4) T(6,1)
-      DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00
-     $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D
-     $ +00,2.700000000000000D+01,9.000000000000000D+00/
+      DATA (CF(I),I= 19, 20) /27,18/
 C     1 T(3,4) T(5,1) T(6,2)
-      DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00
-     $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D
-     $ +00,9.000000000000000D+00,2.700000000000000D+01/
+      DATA (CF(I),I= 21, 21) /27/
 C     1 T(3,4) T(5,2) T(6,1)
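A remark on the new Fortran color data above: the old NCOLOR x NCOLOR real matrix is replaced by a packed row-major upper triangle of integers, with diagonal entries kept as-is, off-diagonal entries pre-doubled (row 1 of the old matrix, 27 9 9 3 3 9, becomes 27 18 18 6 6 18), and a common DENOM factored out and divided once at the end. The short C++ sketch below (illustrative only, with a hypothetical 3x3 symmetric matrix) mirrors the CF_INDEX walk used by the DO J = I, NCOLOR loop in the next hunk and checks it against the full quadratic form:

// Packed upper-triangle color sum (not part of the patch): mirrors the Fortran CF_INDEX loop.
#include <cassert>
int main()
{
  const int ncolor = 3; // hypothetical small example
  const double cf[3][3] = { { 4, 1, 2 }, { 1, 4, 3 }, { 2, 3, 4 } }; // symmetric
  const double jamp[3] = { 0.5, -1.0, 2.0 };                         // real jamps for simplicity
  // Pack: diagonal as-is, off-diagonal doubled, row-major upper triangle (as in the DATA statements)
  double packed[6];
  int k = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      packed[k++] = ( i == j ? cf[i][j] : 2 * cf[i][j] );
  // Triangular sum with a running index, as in the new Fortran loop (DENOM=1 here; the
  // real code divides MATRIX1 by DENOM once after the loops)
  double tri = 0;
  k = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += packed[k++] * jamp[j];
    tri += ztemp * jamp[i];
  }
  // Reference: full symmetric quadratic form
  double full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ ) full += jamp[i] * cf[i][j] * jamp[j];
  assert( tri == full ); // exact here: all values are exactly representable in binary
  return 0;
}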
C     ----------
C     BEGIN CODE
C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-    $    *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-    $    *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+    $      *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -553,10 +547,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -565,6 +561,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
index 66e4b80f71..ee32f26811 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -107,20 +109,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 6;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -179,57 +177,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,     // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,               // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv           // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,                  // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,                    // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,             // input: cuda stream for this helicity
+                   const int gpublocks,               // input: cuda gpublocks
+                   const int gputhreads )             // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,     // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,           // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,        // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,      // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-  ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
     using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
     using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -237,377 +290,147 @@ namespace mg5amcCpu
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
     // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
     // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
       const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
       for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
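A brief aside on the refactored couplings block in this hunk: on the C++ path it builds a per-SIMD-vector table of nxcoup pointers, where the first ndcoup entries advance with the event record (running-alphas-dependent couplings) and the remaining nIPC entries always point at the same fixed buffer. A minimal standalone sketch of this two-tier pointer table (illustrative only; hypothetical sizes and buffers, not the plugin's accessor classes):

#include <cstdio>
int main()
{
  const int ndcoup = 2, nicoup = 1;                           // hypothetical counts
  double depCoups[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } }; // one value per event, per dependent coupling
  double indCoups[1] = { 42 };                                // fixed for all events
  const int ievt0 = 2;                                        // first event of the current SIMD page
  const double* COUPs[ndcoup + nicoup];
  for( int i = 0; i < ndcoup; i++ ) COUPs[i] = &depCoups[i][ievt0];   // strides with the event
  for( int i = 0; i < nicoup; i++ ) COUPs[ndcoup + i] = &indCoups[i]; // shared, no stride
  std::printf( "dep0=%g dep1=%g ind0=%g\n", *COUPs[0], *COUPs[1], *COUPs[ndcoup] );
  return 0;
}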
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_ucx_ttxucx()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) 
in speed here as we only loop over the upper-triangular part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol]
); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -694,7 +517,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG +
fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +602,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -873,26 +708,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +735,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf(
"select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1054,22 +1093,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,17 +1114,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1117,93 +1153,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,25 +1283,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1305,8 +1317,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1336,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1443,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 04b9f5bcb1..75c705a855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -86,17 +87,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -133,23 +134,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -163,34 +167,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 5046df7e56..b693098acb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 848991a32a..87348bace0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -627,6 +631,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ )
+ { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel because M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + 
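// [Editor's aside: an illustrative sketch, not part of the generated color_sum.cc in this diff.]
// The triangular color sum in color_sum_cpu above relies on two properties of the color
// matrix M (see #475): M is real, so for jamps J = A + iB the quadratic form J^dag M J
// reduces to A^T M A + B^T M B, and M is symmetric, so each off-diagonal pair (icol,jcol)
// and (jcol,icol) can be folded into a single upper-triangular term with a factor 2 that
// TriangularNormalizedColorMatrix precomputes at compile time together with /colorDenom.
// A minimal standalone check of that equivalence, assuming plain doubles and a
// hypothetical 2x2 symmetric matrix (all names and values below are illustrative only):
//
//   #include <cassert>
//   #include <cmath>
//   int main()
//   {
//     const double M[2][2] = { { 27, 9 }, { 9, 27 } }; // a real symmetric "color matrix"
//     const double A[2] = { 0.25, -1.5 };              // Re(jamp) per color
//     const double B[2] = { 0.75, 0.5 };               // Im(jamp) per color
//     double full = 0;                                 // full quadratic form A^T M A + B^T M B
//     for( int i = 0; i < 2; i++ )
//       for( int j = 0; j < 2; j++ )
//         full += M[i][j] * ( A[i] * A[j] + B[i] * B[j] );
//     double tri = 0;                                  // upper-triangular rewrite with factor 2
//     for( int i = 0; i < 2; i++ )
//     {
//       double ztempR = M[i][i] * A[i], ztempI = M[i][i] * B[i]; // diagonal term
//       for( int j = i + 1; j < 2; j++ )
//       {
//         ztempR += 2 * M[i][j] * A[j]; // fold M[j][i] into M[i][j]: M is symmetric
//         ztempI += 2 * M[i][j] * B[j];
//       }
//       tri += A[i] * ztempR + B[i] * ztempI;
//     }
//     assert( std::abs( full - tri ) < 1e-12 ); // same |M|^2 contribution either way
//     return 0;
//   }
//
// The rewrite visits only ncolor*(ncolor+1)/2 of the ncolor^2 matrix entries, which is
// why the speedup is real but (as the comment above notes) less than a factor 2: the
// diagonal terms are always computed in full.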
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, the same striding as in compute_jamps and in the cuBLAS color sum is used here, just in case this is better for performance
+ for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+ for( int icol = 0; icol < ncolor; icol++ )
+ allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ __global__ void
+ convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity
+ const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ allMEs[ievt] = allMEsFpt2[ievt];
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+ void
+ color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+ gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+ {
+ const int nevt = gpublocks * gputhreads;
+
+ // Get the address associated with the normalized color matrix in device memory
+ static fptype2* devNormColMat = nullptr;
+ if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+ fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h new file mode 100644 index 0000000000..3e70524053 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
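As an illustration of the two-step BLAS color sum in color_sum_blas above (Step 1: Ztemp = ColorMatrix * Jamps via one gemm; Step 2: one 1x1 gemm per event, i.e. a batched dot product), here is a standalone cuBLAS sketch. The toy sizes and values are invented, plain cublasSgemm* calls stand in for the gpuBlasTgemm* wrappers, and only the real part is computed (the imaginary part is handled identically above):

// Standalone cuBLAS sketch of the two-step color sum (toy 2x2 matrix, 4 events, real part only).
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 4;
  const float M[ncolor * ncolor] = { 27, 9, 9, 27 }; // toy symmetric color matrix (column-major)
  std::vector<float> J( ncolor * nevt );             // J[icol * nevt + ievt], i.e. the "new1" striding
  for( int i = 0; i < ncolor * nevt; i++ ) J[i] = 0.1f * ( i + 1 );
  float *dM, *dJ, *dZ, *dME;
  cudaMalloc( &dM, sizeof( M ) );
  cudaMalloc( &dJ, J.size() * sizeof( float ) );
  cudaMalloc( &dZ, ncolor * nevt * sizeof( float ) );
  cudaMalloc( &dME, nevt * sizeof( float ) );
  cudaMemcpy( dM, M, sizeof( M ), cudaMemcpyHostToDevice );
  cudaMemcpy( dJ, J.data(), J.size() * sizeof( float ), cudaMemcpyHostToDevice );
  cudaMemset( dME, 0, nevt * sizeof( float ) ); // zero MEs so that beta=1 below just accumulates
  cublasHandle_t h;
  cublasCreate( &h );
  const float one = 1, zero = 0;
  // Step 1: Z(ncolor x nevt) = M(ncolor x ncolor) * J^T (J is stored nevt x ncolor, column-major)
  cublasSgemm( h, CUBLAS_OP_N, CUBLAS_OP_T, ncolor, nevt, ncolor, &one, dM, ncolor, dJ, nevt, &zero, dZ, ncolor );
  // Step 2: for each event e, ME[e] += J(:,e) dot Z(:,e), as nevt batched 1x1 gemms
  cublasSgemmStridedBatched( h, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, ncolor, &one,
                             dJ, nevt, 1,        // A_e: the 1 x ncolor "row" of J for event e (stride 1)
                             dZ, ncolor, ncolor, // B_e: the ncolor x 1 column Z(:,e) (stride ncolor)
                             &one, dME, 1, 1, nevt );
  std::vector<float> me( nevt );
  cudaMemcpy( me.data(), dME, nevt * sizeof( float ), cudaMemcpyDeviceToHost );
  for( int e = 0; e < nevt; e++ ) // cross-check against a plain CPU loop
  {
    float ref = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        ref += J[i * nevt + e] * M[i + j * ncolor] * J[j * nevt + e];
    assert( std::abs( me[e] - ref ) < 1e-3f );
    printf( "event %d: ME=%f\n", e, me[e] );
  }
  cublasDestroy( h );
  cudaFree( dM ); cudaFree( dJ ); cudaFree( dZ ); cudaFree( dME );
  return 0;
}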
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 7 ***
+ // Wavefunction(s) for diagram number 1
+ ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+ oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+ FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] );
+ FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 7 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 7 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+ // Amplitude(s) for diagram number 3
+ VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 4 OF 7 ***
+ // Wavefunction(s) for diagram number 4
+ FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 4
+ FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 5 OF 7 ***
+ // Wavefunction(s) for diagram number 5
+ FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 5
+ FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 6 OF 7 ***
+ // Wavefunction(s) for diagram number 6
+ FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 6
+ FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 7 OF 7 ***
+ // Wavefunction(s) for diagram number 7
+ FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 7
+ FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 5dcb5155f3..210248dac7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +420,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +464,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 
T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +559,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +573,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8d266e82b7..50c33f72e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -229,489 +282,161 @@ namespace mg5amcCpu
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
#endif
#endif /* clang-format on */
- mgDebug( 0, __FUNCTION__ );
- //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
- //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
- //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
- //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
- // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
- // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
- static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+ // ----------------------------
+ // --- WAVEFUNCTION BUFFERS ---
+ // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
 // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
 // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
- // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
- // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
 cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
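The "= {}" note in the removed jamp_sv declaration above is worth a one-line illustration: for a type with a trivial default constructor (as the plugin's scalar cxtype is, per that comment), a local array is NOT zeroed unless it is explicitly aggregate-initialized. A minimal standalone sketch, with an invented cxsimple type standing in for cxtype:

// Standalone sketch of why "= {}" matters (cxsimple is a hypothetical stand-in for the scalar cxtype).
#include <cstdio>
struct cxsimple { double r, i; }; // trivial default constructor: members are left uninitialized
int main()
{
  cxsimple jampA[3];      // default-initialized: r and i hold indeterminate values (reading them is UB)
  (void)jampA;
  cxsimple jampB[3] = {}; // aggregate-initialized: every member is zeroed, safe to accumulate into
  for( int k = 0; k < 3; k++ ) printf( "jampB[%d] = (%f, %f)\n", k, jampB[k].r, jampB[k].i );
  return 0;
}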
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
 // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
 // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
 for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
 const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+ // -----------------
+ // --- COUPLINGS ---
+ // -----------------
#ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
- // CUDA kernels take input/output buffers with momenta/MEs for all events
- const fptype* momenta = allmomenta;
- const fptype* COUPs[nxcoup];
- for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
- fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
- fptype* denominators = allDenominators;
-#endif
+ // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+ const fptype* couplings = allcouplings;
#else
- // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
- const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+ // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
 const fptype* COUPs[nxcoup];
+ // Dependent couplings, vary event-by-event
 for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
- fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
- // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
- for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
- fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
- // *** DIAGRAM 1 OF 14 ***
-
- // Wavefunction(s) for diagram number 1
- ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
-
- ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
-
- oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
- ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
- oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
- oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
- FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] );
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 1
- FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 2 OF 14 ***
-
- // Wavefunction(s) for diagram number 2
- FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 2
- FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 3 OF 14 ***
-
- // Wavefunction(s) for diagram number 3
- FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 3
- VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 4 OF 14 ***
-
- // Wavefunction(s) for diagram number 4
- FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 4
- FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12.
* amp_sv[0];
-
- // *** DIAGRAM 5 OF 14 ***
-
- // Wavefunction(s) for diagram number 5
- FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 5
- FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12. * amp_sv[0];
-
- // *** DIAGRAM 6 OF 14 ***
-
- // Wavefunction(s) for diagram number 6
- FFV1P0_3( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] );
- FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
- FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-
- // Amplitude(s) for diagram number 6
- FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 7 OF 14 ***
-
- // Wavefunction(s) for diagram number 7
- FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-
- // Amplitude(s) for diagram number 7
- FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 8 OF 14 ***
-
- // Wavefunction(s) for diagram number 8
- // (none)
-
- // Amplitude(s) for diagram number 8
- VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 9 OF 14 ***
-
- // Wavefunction(s) for diagram number 9
- FFV1_2( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 9
- FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 10 OF 14 ***
-
- // Wavefunction(s) for diagram number 10
- FFV1_1( w_fp[4], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 10
- FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36.
* amp_sv[0];
-
- // *** DIAGRAM 11 OF 14 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
- // *** DIAGRAM 12 OF 14 ***
-
- // Wavefunction(s) for diagram number 12
- FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
+#else
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 13 OF 14 ***
-
- // Wavefunction(s) for diagram number 13
- FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
+#else
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12.
* amp_sv[0];
-
- // *** DIAGRAM 14 OF 14 ***
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12. * amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
- for( int icol = 0; icol < ncolor; icol++ )
- jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_2_uu_ttxuu()?)
-
- // The color denominators (initialize all array elements, with ncolor=6)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6]
-
- // The color matrix (initialize all array elements, with ncolor=6)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 27, 9, 9, 3, 3, 9 },
- { 9, 27, 3, 9, 9, 3 },
- { 9, 3, 27, 9, 9, 3 },
- { 3, 9, 9, 27, 3, 9 },
- { 3, 9, 9, 3, 27, 9 },
- { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6]
-
-#ifndef MGONGPUCPP_GPUIMPL
- // Pre-compute a constexpr triangular color matrix properly normalized #475
- struct TriangularNormalizedColorMatrix
- {
- // See https://stackoverflow.com/a/34465458
- __host__ __device__ constexpr TriangularNormalizedColorMatrix()
- : value()
- {
- for( int icol = 0; icol < ncolor; icol++ )
- {
- // Diagonal terms
- value[icol][icol] = cf[icol][icol] / denom[icol];
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv +=
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) [...] (nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) [...] + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1160,20 +1101,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_jamps) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index fd123d932d..8b71fbebc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 14; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index 77164138e6..87bbc98a81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index f03c7f3b0c..8712e90238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { +
// Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
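//--------------------------------------------------------------------------
// [Editorial aside: a minimal, self-contained sketch of the #475 triangular
//  color-sum trick implemented by color_sum_cpu above; this block is NOT part
//  of the generated diff. Since the color matrix M is real and symmetric, the
//  quadratic form (A-iB)M(A+iB) reduces to AMA + BMB, and the off-diagonal
//  factor 2 and the 1/denom normalization can be folded into the matrix once,
//  so only the upper triangle is visited. The 2x2 matrix, the equal
//  denominators and the jamp values below are arbitrary assumptions chosen
//  only to check the algebra (the sketch assumes equal denominators, as in
//  the all-ones colorDenom of this process).]
#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { 3, -1 } };
  // Full quadratic form: me = sum_ij cf[i][j]/denom[i] * Re( conj(jamp[i]) * jamp[j] )
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  // Triangular form: visit the diagonal once and each off-diagonal pair once, with a pre-doubled coefficient
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meTri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  // Both forms agree: the triangular loop does roughly half the multiplications
  assert( std::abs( meFull - meTri ) < 1e-12 );
  return 0;
}
//--------------------------------------------------------------------------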
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h new file mode 100644 index 0000000000..9f38cec61a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 14 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 14 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 14 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 14 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 14 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 14 *** + // Wavefunction(s) for diagram number 10 + FFV1_1( w_fp[4], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 14 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 14 *** + // Wavefunction(s) for diagram number 12 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 14 *** + // Wavefunction(s) for diagram number 13 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 14 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b80833180..ca1ea52d2b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 1b918bae84..a00dd1fdde 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,20 +109,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,57 +177,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -237,377 +290,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], 
w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif
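[Note] M_ACCESS::ieventAccessRecordConst above returns a pointer to the record (the SIMD event page) that contains event ievt0 inside the full momenta buffer, so the C++ diagram kernels only ever see one page. A hedged sketch of the underlying pointer arithmetic, assuming a hypothetical AOSOA layout momenta[npagV][npar][np4][neppV] with illustrative sizes (the real accessor lives in MemoryAccessMomenta.h and differs in detail):

  constexpr int npar = 6, np4 = 4, neppV = 4; // assumed sizes, for illustration only
  inline const double* ieventAccessRecordConstSketch( const double* buffer, int ievt0 )
  {
    const int ipagV = ievt0 / neppV;            // index of the SIMD page containing ievt0
    return buffer + ipagV * npar * np4 * neppV; // first fptype of that page
  }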
- jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxccx()?)
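[Note] The '-' lines below (now superseded by color_sum.cc) implement the standard color sum over leading-color flows. With partial amplitudes $J_i$ (jamp_sv), color matrix $\mathrm{cf}$ and denominators $\mathrm{denom}$ as defined in the code:

$$ \Delta|M|^2 \;=\; \sum_{i=1}^{n_{\mathrm{color}}} \frac{1}{\mathrm{denom}_i}\,\mathrm{Re}\Big( J_i^{*} \sum_{j=1}^{n_{\mathrm{color}}} \mathrm{cf}_{ij}\, J_j \Big) $$

Because cf is real and symmetric, this equals the diagonal terms plus twice the upper triangle, which is exactly what the constexpr TriangularNormalizedColorMatrix below precomputes at compile time (#475).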
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
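[Note] In the mixed-precision branch above, fpvmerge packs the two double-precision SIMD pages (even and odd iParity) into one single-precision vector of twice the width, so the color algebra runs once in float for both pages (#537). A rough sketch of the idea using GCC vector extensions and an assumed neppV=4 (the real fpvmerge, fpvsplit0 and fpvsplit1 are defined in mgOnGpuVectors.h; this sketch is illustrative only):

  typedef double double4 __attribute__( ( vector_size( 32 ) ) ); // one double page (neppV=4)
  typedef float float8 __attribute__( ( vector_size( 32 ) ) );   // two pages merged in float
  inline float8 fpvmergeSketch( const double4& even, const double4& odd )
  {
    float8 out;
    for( int i = 0; i < 4; i++ ) out[i] = (float)even[i];    // even page in the low half
    for( int i = 0; i < 4; i++ ) out[4 + i] = (float)odd[i]; // odd page in the high half
    return out;
  }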
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ -
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -694,7 +517,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 
+602,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -873,26 +708,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +735,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype*
colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
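[Note] The color choice in select_col above is inverse-CDF sampling: targetamp accumulates the icolamp-masked jamp2 weights into a running sum, and the first color whose normalized running sum exceeds the random number is selected. The same logic in a self-contained form (pickIndex is a hypothetical helper, not part of this diff):

  #include <cstdio>
  int pickIndex( const double* weights, int n, double rnd ) // rnd in [0,1)
  {
    double total = 0;
    for( int i = 0; i < n; i++ ) total += weights[i];
    double cumul = 0;
    for( int i = 0; i < n; i++ )
    {
      cumul += weights[i];                // running sum, like targetamp[icolC]
      if( rnd < cumul / total ) return i; // first bin whose CDF exceeds rnd
    }
    return n - 1;                         // guard against rounding at the upper edge
  }
  int main()
  {
    const double w[3] = { 1., 2., 1. };
    printf( "%d\n", pickIndex( w, 3, 0.5 ) ); // prints 1 (the heaviest bin)
    return 0;
  }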
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1054,22 +1093,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,17 +1114,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1117,93 +1153,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,25 +1283,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1305,8 +1317,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1336,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1443,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 87faf25dfb..17302e0d54 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -86,17 +87,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb
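[Note] As a cross-check of the constants above: this process has npar = 6 external fermions (u u~ -> t t~ c c~), each with two helicity states, so the helicity combination count quoted in the header follows as

$$ n_{\mathrm{comb}} = 2^{n_{\mathrm{par}}} = 2^{6} = 64. $$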
@@ -133,23 +134,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
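[Note] The ghelAll* arguments declared below are 'super-buffers': one contiguous per-event buffer per good helicity, concatenated and indexed by ighel with a fixed stride, which is how CPPProcess.cc slices ghelAllMEs, ghelAllJamps and ghelAllWfs. A minimal sketch of the slicing, using double in place of fptype (illustrative only):

  inline double* superBufferSlice( double* ghelBuffer, int ighel, long stridePerHel )
  {
    // e.g. stridePerHel = nevt for ghelAllMEs, ncolor*2*nevt for ghelAllJamps,
    // and nwf*nw6*2*nevt for ghelAllWfs: slice ighel is one full per-event buffer
    return ghelBuffer + ighel * stridePerHel;
  }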
@@ -163,34 +167,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index e3f26606a1..41ac73e027 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 74f009d272..92abff14ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -627,6 +631,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) 
+ { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain some speed here (though not a factor 2) as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + 
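For reference, the triangular color sum in color_sum_cpu above reduces to the following minimal scalar sketch (no SIMD vectors, none of the plugin's fptype/cxtype machinery; the colorSum name and its arguments are illustrative only, not part of the patch). It assumes, as holds for the generated color matrices, that the color matrix is real and that cf[i][j]/denom[i] is symmetric, so each off-diagonal term can be counted twice from the upper triangle:

#include <array>
#include <complex>

// Illustrative stand-alone upper-triangular color sum:
// |M|^2 = sum_ij jamp_i^* ( cf_ij / denom_i ) jamp_j, for a real matrix with cf/denom symmetric.
// Writing jamp = A + iB, the cross terms cancel and the result is AMA + BMB (see #475).
template<int N>
double colorSum( const std::array<std::complex<double>, N>& jamp,
                 const double ( &cf )[N][N],   // color matrix (real)
                 const double ( &denom )[N] )  // per-row denominators
{
  double me2 = 0;
  for( int i = 0; i < N; i++ )
  {
    // Diagonal term, then doubled off-diagonal terms from the upper triangle only
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < N; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB
  }
  return me2;
}

In the plugin itself the factor 2 and the division by colorDenom are folded into the constexpr TriangularNormalizedColorMatrix at compile time, so the runtime loops only perform multiply-adds.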
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, in case this is better for performance, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h new file mode 100644 index 0000000000..57e2446ba9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
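For reference, before the generated diagram kernels that follow, the two cuBLAS calls in color_sum_blas above amount to the minimal sketch below, shown for the real components only (the imaginary components are handled identically and accumulate into the same MEs). The cublasSgemm/cublasSgemmStridedBatched names are the plain single-precision cuBLAS entry points onto which the gpuBlasTgemm wrappers are assumed to map; the d_* device pointers are illustrative only, not part of the patch:

#include <cublas_v2.h>

// Step 1: Ztemp (ncolor x nevt) = NormColMat (ncolor x ncolor) * Jamps^T (ncolor x nevt),
// where Jamps uses the "new1" event-major striding: element (ievt,icol) at icol*nevt+ievt.
// Step 2: nevt batched 1x1 GEMMs, i.e. one dot product Jamps(ievt,:) . Ztemp(:,ievt) per event,
// accumulated (beta=1) into the running |M|^2 sums.
void colorSumBlasReal( cublasHandle_t handle, int ncolor, int nevt,
                       const float* d_colmat,    // [ncolor*ncolor] normalized color matrix (column-major)
                       const float* d_jampsReal, // [ncolor*nevt] real parts of the jamps
                       float* d_ztemp,           // [ncolor*nevt] scratch buffer
                       float* d_mes )            // [nevt] running sums of |M|^2 over helicities
{
  const float one = 1.f, zero = 0.f;
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T,
               ncolor, nevt, ncolor,
               &one, d_colmat, ncolor, d_jampsReal, nevt,
               &zero, d_ztemp, ncolor );
  cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                             1, 1, ncolor,
                             &one,
                             d_jampsReal, nevt, 1,    // batch ievt reads row ievt of Jamps
                             d_ztemp, ncolor, ncolor, // batch ievt reads column ievt of Ztemp
                             &one,                    // beta=1: accumulate into the running sum
                             d_mes, 1, 1,
                             nevt );
}

Expressing the per-event dot products as a strided-batched GEMM of 1x1 results avoids a separate reduction kernel and keeps the whole color sum inside cuBLAS on the stream bound to this helicity's handle.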
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 7 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 7 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 7 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 7 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 7 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 7 *** + // Wavefunction(s) for diagram number 6 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 7 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 728711155f..6dffcf0951 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +420,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +464,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 
T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +559,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +573,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 1c575b7757..2863f773a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[4], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[4], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxgg()?) 
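For reference, every removed diagram block above repeats the same multichannel bookkeeping. A minimal sketch (illustrative names, scalar doubles instead of fptype_sv, not the generated code) of that pattern and of how the single-diagram enhancement weight is formed:

// Toy only: per-diagram multichannel bookkeeping.
// For diagram idiag, |amp|^2 enters the numerator only when this event's
// channelId equals idiag, and enters the denominator for every diagram;
// channelId == 0 disables the single-diagram enhancement altogether.
void accumulateChannelWeights( unsigned int channelId, unsigned int idiag, double amp2,
                               double& numerator, double& denominator )
{
  if( channelId == idiag ) numerator += amp2;
  if( channelId != 0 ) denominator += amp2;
}
// After the loop over diagrams (and helicities), the matrix element is
// reweighted as ME *= numerator / denominator, cf. normalise_output further below.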
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, - { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, - { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, - { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, - { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, - { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, - { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, - { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, - { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, - { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, - { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, - { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif
- */
- } // END LOOP ON IPARITY
- mgDebug( 1, __FUNCTION__ );
+ }
+ // *****************************
+ // *** END LOOP ON IPARITY ***
+ // *****************************
+ return;
  }

@@ -1113,7 +567,11 @@ namespace mg5amcCpu
#else
  memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
#endif
- fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+ // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+ fpeEnable();
+#endif
  }

  //--------------------------------------------------------------------------

@@ -1148,6 +606,10 @@ namespace mg5amcCpu
  m_masses.push_back( m_pars->mdl_MT );
  m_masses.push_back( m_pars->ZERO );
  m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
  // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
  // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
  const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -1190,6 +652,10 @@ namespace mg5amcCpu
  m_masses.push_back( Parameters_sm::mdl_MT );
  m_masses.push_back( Parameters_sm::ZERO );
  m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
  }
#endif

@@ -1292,26 +758,26 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  using G_ACCESS = DeviceAccessGs;
- using C_ACCESS = DeviceAccessCouplings;
- G2COUP( allgs, allcouplings, bsmIndepParam );
+ using CD_ACCESS = DeviceAccessCouplings;
+ G2COUP( allgs, allcouplings, bsmIndepParam );
#else
  using namespace mg5amcCpu;
  using G_ACCESS = HostAccessGs;
- using C_ACCESS = HostAccessCouplings;
+ using CD_ACCESS = HostAccessCouplings;
  for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
  {
  const int ievt0 = ipagV * neppV;
  const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
  fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
- G2COUP( gs, couplings, bsmIndepParam );
+ G2COUP( gs, couplings, bsmIndepParam );
  }
#endif
  }

  //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+ void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
  const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
  fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
  fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
- { /* clang-format on */
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+ fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ { /* clang-format on */
+ const int maxtry0 = 16;
+ fptype hstMEs[maxtry0];
+ const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
...
  // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV)
  assert( nevt >= neppV );
  const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
...
+#ifdef MGONGPUCPP_GPUIMPL
+ __global__ void
+ add_and_select_hel( int* allselhel, // output: helicity selection[nevt]
+                     const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+                     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+                     const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+ ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+ }
+ // Event-by-event random choice of helicity #403
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+ {
+ const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+ allselhel[ievt] = ihelF;
+ //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+ break;
+ }
+ }
+ return;
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+ {
+ using J_ACCESS = DeviceAccessJamp;
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icol = 0; icol < ncolor; icol++ )
+ // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
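The atomicAdd in update_jamp2s above is what makes the per-helicity streams safe to run concurrently, since several helicity streams accumulate into the same colAllJamp2s element. A self-contained toy (not mg5amc code) of the same pattern:

#include <cuda_runtime.h>
// Toy: kernels launched on different streams all accumulate into the same
// output buffer; atomicAdd avoids lost updates when two streams perform the
// concurrent read-modify-write on the same element.
__global__ void addJamp2Toy( const float* jamp2OneHel, float* jamp2Sum, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) atomicAdd( &jamp2Sum[i], jamp2OneHel[i] ); // safe across concurrent streams
}
// Usage sketch: launch addJamp2Toy once per helicity, each on its own stream,
// with the same jamp2Sum pointer; a plain "+=" here would be a data race.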
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ select_col( int* allselcol, // output: color selection[nevt]
+             const fptype* allrndcol, // input: random numbers[nevt] for color selection
+             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+             const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+             const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+ // SCALAR channelId for the current event (CUDA)
+ unsigned int channelId = gpu_channelId( allChannelIds );
+ // Event-by-event random choice of color #402
+ if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+ fptype_sv jamp2_sv[ncolor] = { 0 };
+ assert( allJamp2s != nullptr ); // sanity check
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+ }
+ //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+ {
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+ break;
+ }
+ }
+ }
+ else
+ {
+ allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+ }
+ return;
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------

  // Evaluate |M|^2, part independent of incoming flavour
- __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+ void
  sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
  const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
  const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
  fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+ int* allselhel, // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+ fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+ fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+ gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+#else
+ void
+ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
+ const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+ const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
#endif
+ fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
  int* allselhel, // output: helicity selection[nevt]
- int* allselcol // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
- , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
#endif
- ) /* clang-format on */
+ const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ )
+#endif /* clang-format on */
  {
  mgDebugInitialise();

@@ -1473,22 +1143,16 @@ namespace mg5amcCpu
  // These variables are not used anywhere else in the code and their scope is limited to this sanity check
  {
  // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
- constexpr int nprocesses = 2;
+ constexpr int nprocesses = 1;
  static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
- constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+ constexpr int process_id = 1; // code generation source: standalone_cudacpp
  static_assert( process_id == 1, "Assume process_id == 1" );
  }

  // Denominators: spins, colors and identical particles
  constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef MGONGPUCPP_GPUIMPL
- // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
  //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
  //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
  using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -1500,17 +1164,20 @@ namespace mg5amcCpu
#endif

  // Start sigmaKin_lines
- #include "GpuAbstraction.h"

- // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
+ // === PART 0 - INITIALISATION (before calculate_jamps) ===
  // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
#ifdef MGONGPUCPP_GPUIMPL
- allMEs[ievt] = 0;
+ const int nevt = gpublocks * gputhreads;
+ gpuMemset( allMEs, 0, nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- allNumerators[ievt] = 0;
- allDenominators[ievt] = 0;
+ gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
#endif
+ gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
#else
  const int npagV = nevt / neppV;
  for( int ipagV = 0; ipagV < npagV; ++ipagV )
@@ -1536,93 +1203,63 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++

  // *** START OF PART 1a - CUDA (one event per GPU thread) ***
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
- const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
- // NB: channelIds_sv is a scalar in CUDA
- channelId = channelIds_sv;
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
- }
-#endif
- // Running sum of partial amplitudes squared for event by event color selection (#402)
- // (for the single event processed in calculate_wavefunctions)
- fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
- fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+ // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+ // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
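A self-contained toy (not mg5amc code) of the one-stream-per-helicity launch pattern described in the comments above; kernels queued on the same stream still execute in order, while kernels on different streams may overlap:

#include <cuda_runtime.h>
// Toy: one stream per "helicity"; each stream gets its own slice of a
// super-buffer, so the per-helicity work is independent and can overlap.
__global__ void workToy( float* buf, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) buf[i] += 1.f;
}
int main()
{
  const int nhel = 4, n = 1024; // illustrative sizes
  float* superBuf = nullptr;
  cudaMalloc( &superBuf, nhel * n * sizeof( float ) );
  cudaMemset( superBuf, 0, nhel * n * sizeof( float ) );
  cudaStream_t streams[nhel];
  for( int ih = 0; ih < nhel; ih++ )
  {
    cudaStreamCreate( &streams[ih] );
    workToy<<<n / 256, 256, 0, streams[ih]>>>( superBuf + ih * n, n ); // may overlap across streams
  }
  cudaDeviceSynchronize(); // wait for all streams, cf. the gpuDeviceSynchronize below
  for( int ih = 0; ih < nhel; ih++ ) cudaStreamDestroy( streams[ih] );
  cudaFree( superBuf );
  return 0;
}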
  for( int ighel = 0; ighel < cNGoodHel; ighel++ )
  {
  const int ihel = cGoodHel[ighel];
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+ fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+ calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
#else
- calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+ calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
#endif
- MEs_ighel[ighel] = allMEs[ievt];
- }
- // Event-by-event random choice of helicity #403
- //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
- for( int ighel = 0; ighel < cNGoodHel; ighel++ )
- {
- if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
- {
- const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
- allselhel[ievt] = ihelF;
- //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
- break;
- }
- }
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream
+ for( int ighel = 0; ighel < cNGoodHel; ighel++ )
  {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = 0;
- else
- targetamp[icolC] = targetamp[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
- {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
- break;
- }
- }
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+ gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s );
  }
+#endif
+ // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+ if( !ghelBlasHandles )
+ assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+ else
+ assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu)
+ for( int ighel = 0; ighel < cNGoodHel; ighel++ )
+ {
+ fptype* hAllMEs = ghelAllMEs + ighel * nevt;
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr );
+ if( hAllBlasTmp )
+ gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0)
#else
+ fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr );
+ if( hAllBlasTmp )
+ gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...)
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+ gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 0689624568..8906bee944 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 5787ba42b2..9f0a834688 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index 75d947b792..02fad5c3ba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc new file mode 100644 index 0000000000..ffbf0d5f94 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, + { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, + { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, + { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, + { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, + { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, + { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, + { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, + { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, + { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, + { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, + { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) 
or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, in C++ we use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed here (though not a full factor of 2) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the 
running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h new file mode 100644 index 0000000000..0dd99001f6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
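// [Note] For reference before the per-diagram kernels below: the three color-sum
// implementations above (color_sum_cpu, color_sum_kernel and color_sum_blas) all
// compute the same contraction for one helicity, |M|^2 += sum_ij jamp_i^* *
// ( colorMatrix_ij / colorDenom_i ) * jamp_j. A minimal standalone sketch in plain
// C++ (illustrative only: std::complex replaces the plugin's cxtype types, and
// colorSumReference is a hypothetical name, not part of the generated code):
#include <complex>
double colorSumReference( const std::complex<double>* jamp, // [ncolor] color-ordered partial amplitudes
                          const double* cf,                 // [ncolor*ncolor] color matrix, row-major
                          const double* denom,              // [ncolor] per-row denominators
                          int ncolor )
{
  // Since cf is real and symmetric, the quadratic form (A-iB) cf (A+iB) reduces
  // to A*cf*A + B*cf*B over the real (A) and imaginary (B) parts of the jamps (#475)
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0; // i-th component of ( cf / denom ) * jamp
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] / denom[i] * jamp[j].real();
      ztempI += cf[i * ncolor + j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  return me2;
}
// The BLAS path factorizes exactly this computation: Step 1 is one gemm per
// real/imaginary part ( ztemp = normalized color matrix times jamps, for all events
// at once ), Step 2 is a batch of per-event dot products jamp . ztemp via
// gemmStridedBatched with beta=1, accumulating into the running sum over helicities.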
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + }
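// [Note] diagram_boilerplate.h is a symlink whose target is not included in this diff.
// Judging from the names used in the diagram kernels (an assumption, not verified here),
// it presumably:
// - defines J_ACCESS (DeviceAccessJamp on GPU, HostAccessJamp in C++) for the
//   kernelAccessIcol( jamps, icol ) updates;
// - declares amp_sv, the amplitude of the current diagram, and _fp, a raw fptype*
//   alias of amp_sv that the FFV*/VVV* helas calls fill through &_fp[0];
// - maps the w_fp wavefunction slots (nwf=15 for this process) onto the wfs buffer;
// - under MGONGPU_SUPPORTS_MULTICHANNEL, derives the scalar channelId from channelIds
//   and binds numerators_sv/denominators_sv to the numerators/denominators buffers;
//   otherwise it asserts that channelIds, numerators and denominators are all nullptr,
//   as noted in the comment at the top of each kernel.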
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + }
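// [Note] The two multichannel lines repeated in every diagram kernel implement the
// single-diagram enhancement (SDE): per event, denominators_sv accumulates the
// squared amplitude |amp|^2 of every diagram, while numerators_sv accumulates it
// only when channelId selects this specific diagram; after the sum over helicities,
// normalise_output rescales the ME by allNumerators[ievt] / allDenominators[ievt]
// (and skips the rescaling when allChannelIds is nullptr, see #892/#899).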
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + }
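// [Note] The w_fp slots are a scratch pad shared across the diagram kernels of one
// helicity: for example w_fp[8] is written by FFV1_1 in diagram 1, overwritten by
// FFV1_2 in diagram 2 and by FFV1P0_3 in diagram 3, then read again by diagrams 4-6
// and 9. The diagram kernels for a given helicity must therefore execute in order;
// on GPU this is presumably guaranteed by launching them all on that helicity's
// dedicated stream (ghelStreams[ighel]), which serializes kernels within one
// helicity while different helicities can still run concurrently.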
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[4], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[4], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
index 65c377ffc0..563e5bc0a0 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(17)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01,
-     $ -2.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 1),I= 7, 12) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,0.000000000000000D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00/
+      DATA DENOM/3/
+      DATA (CF(I),I= 1, 12) /48,-12,32,-4,32,-4,12,12,0,-4,32,0/
 C     1 T(2,1) T(5,6,3,4)
-      DATA (CF(I, 2),I= 1, 6) /-2.000000000000000D+00
-     $ ,1.600000000000000D+01,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
+      DATA (CF(I),I= 13, 23) /48,-4,32,-4,32,12,12,0,32,-4,0/
 C     1 T(2,1) T(6,5,3,4)
-      DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,1.600000000000000D+01,
-     $ -2.000000000000000D+00,2.000000000000000D+00,2.000000000000000D
-     $ +00/
-      DATA (CF(I, 3),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00/
+      DATA (CF(I),I= 24, 33) /48,-12,12,12,32,-4,-4,0,0,32/
 C     1 T(2,4) T(5,6,3,1)
-      DATA (CF(I, 4),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-2.000000000000000D+00
-     $ ,1.600000000000000D+01,2.000000000000000D+00,2.000000000000000D
-     $ +00/
-      DATA (CF(I, 4),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,-6.666666666666666D-01/
+      DATA (CF(I),I= 34, 42) /48,12,12,-4,32,32,0,0,-4/
 C     1 T(2,4) T(6,5,3,1)
-      DATA (CF(I, 5),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,2.000000000000000D+00,2.000000000000000D
-     $ +00,1.600000000000000D+01,-2.000000000000000D+00/
-      DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,-6.666666666666666D-01/
+      DATA (CF(I),I= 43, 50) /48,-12,32,-4,32,0,0,-4/
 C     1 T(3,1) T(5,6,2,4)
-      DATA (CF(I, 6),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,2.000000000000000D+00,2.000000000000000D
-     $ +00,-2.000000000000000D+00,1.600000000000000D+01/
-      DATA (CF(I, 6),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00/
+      DATA (CF(I),I= 51, 57) /48,-4,32,-4,0,0,32/
 C     1 T(3,1) T(6,5,2,4)
-      DATA (CF(I, 7),I= 1, 6) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01,
-     $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
+      DATA (CF(I),I= 58, 63) /48,-12,0,32,-4,0/
 C     1 T(3,4) T(5,6,2,1)
-      DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 8),I= 7, 12) /-2.000000000000000D+00
-     $ ,1.600000000000000D+01,0.000000000000000D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00/
+      DATA (CF(I),I= 64, 68) /48,0,-4,32,0/
 C     1 T(3,4) T(6,5,2,1)
-      DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 9),I= 7, 12) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D
-     $ +00,5.333333333333333D+00,2.000000000000000D+00/
+      DATA (CF(I),I= 69, 72) /48,32,32,12/
 C     1 T(5,2,1) T(6,3,4)
-      DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,0.000000000000000D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,0.000000000000000D+00/
-      DATA (CF(I, 10),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D
-     $ +01,2.000000000000000D+00,5.333333333333333D+00/
+      DATA (CF(I),I= 73, 75) /48,12,32/
 C     1 T(5,2,4) T(6,3,1)
-      DATA (CF(I, 11),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,0.000000000000000D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,0.000000000000000D+00/
-      DATA (CF(I, 11),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D
-     $ +00,1.600000000000000D+01,5.333333333333333D+00/
+      DATA (CF(I),I= 76, 77) /48,32/
 C     1 T(5,3,1) T(6,2,4)
-      DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D
-     $ +00,5.333333333333333D+00,1.600000000000000D+01/
+      DATA (CF(I),I= 78, 78) /48/
 C     1 T(5,3,4) T(6,2,1)
 C     ----------
 C     BEGIN CODE
 C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -761,10 +695,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -773,6 +709,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
 
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
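The matrix1.f hunks above replace the dense REAL*8 color matrix by a packed integer form. CF is symmetric, so only the NCOLOR*(NCOLOR+1)/2 = 78 entries of its upper triangle are kept; each entry is scaled by the common denominator DENOM=3 so that it becomes an exact integer (old CF(1,1)=1.6D+01 becomes 48/3), and off-diagonal entries are additionally doubled (old CF(2,1)=-2D0 becomes -12/3) so that the new DO J = I, NCOLOR loop reproduces the full symmetric sum: only the real part of ZTEMP*DCONJG(JAMP(I,N)) survives in the REAL*8 MATRIX1, and the J<I and J>I contributions to that real part are equal. A small self-contained check of the packed indexing, with an illustrative helper name, is sketched below.

// Illustration only (not part of the patch): the 1-based packed index produced by the
// Fortran traversal "CF_INDEX = 0; DO I = 1, NCOLOR; DO J = I, NCOLOR; CF_INDEX = CF_INDEX + 1"
#include <cassert>
constexpr int NCOLOR = 12;
inline int packedIndex( int i, int j ) // requires 1 <= i <= j <= NCOLOR
{
  assert( 1 <= i && i <= j && j <= NCOLOR );
  return ( i - 1 ) * NCOLOR - ( i - 1 ) * ( i - 2 ) / 2 + ( j - i + 1 );
}
// Consistency with the DATA statements above:
// packedIndex(1,1) == 1, packedIndex(2,2) == 13, packedIndex(3,3) == 24, packedIndex(12,12) == 78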
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
index e6d6423d5e..e0b9996ffc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,20 +101,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 6;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -171,57 +169,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
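The new DeviceAccessJamp2 accessor pins down a structure-of-arrays layout for the jamps buffer: for a fixed color index icol the values of all nevt events are contiguous, so consecutive CUDA threads (consecutive ievt) touch consecutive addresses and their loads and stores coalesce. A host-side sketch of the same index arithmetic, with a hypothetical helper name and with nevt and ievt passed in explicitly instead of being derived from the CUDA grid:

// Illustration only: buffer[icol * nevt + ievt] (SoA) rather than buffer[ievt * ncolor + icol] (AoS)
inline fptype& hostAccessIcol( fptype* buffer, const int icol, const int ievt, const int nevt )
{
  return buffer[icol * nevt + ievt]; // same arithmetic as kernelAccessIcol, minus the gridDim/blockIdx bookkeeping
}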
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,     // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,               // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv           // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
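Taken together with the per-diagram lines if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] ) and if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ), the scalar returned by gpu_channelId drives MadEvent's single-diagram-enhancement (SDE) weighting: every diagram contributes |amp|^2 to the denominator, while only the diagram matching the selected channel contributes to the numerator, so the matrix element can later be reweighted by |A_channel|^2 / sum_d |A_d|^2. A sketch of that final combination, with a hypothetical helper name, under the assumption that the ratio is applied downstream exactly as accumulated here:

// Illustration only: how the accumulated SDE numerator and denominator are typically folded into the ME
inline fptype sdeWeightedME( const fptype me, const fptype numerator, const fptype denominator )
{
  return denominator != 0 ? me * numerator / denominator : me; // guard against a vanishing denominator
}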
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,489 +282,161 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
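// Illustration (a sketch for this rewrite, not generated code): the jamp bookkeeping declared
// above amounts to the following, for a hypothetical process with two diagrams and two color
// flows, where c0/c1/c2 stand for rational color factors like the 1./36. or 1./12. seen below:
//   cxtype_sv jamp[2] = {};   // one partial amplitude per leading color flow
//   jamp[0] += c0 * amp;      // diagram 1 contributes to color flow 0 ...
//   jamp[1] -= c1 * amp;      // ... and, with a different coefficient, to color flow 1
//   jamp[1] += c2 * amp;      // diagram 2 recomputes amp, then contributes to flow 1 only
// so that each jamp[icol] accumulates the amplitudes of all diagrams in color flow icol.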
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 14 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 14 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 3 OF 14 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 14 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 14 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 14 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 7 OF 14 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 8 OF 14 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 14 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 10 OF 14 *** - - // Wavefunction(s) for diagram number 10 - FFV1_1( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36.
* amp_sv[0]; - - // *** DIAGRAM 11 OF 14 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - // *** DIAGRAM 12 OF 14 *** - - // Wavefunction(s) for diagram number 12 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 13 OF 14 *** - - // Wavefunction(s) for diagram number 13 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12.
* amp_sv[0]; - - // *** DIAGRAM 14 OF 14 *** - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxuux()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
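// As a concrete scalar illustration of the AMA + BMB folding described above (a sketch, not
// generated code: jampR/jampI stand for the real and imaginary parts A and B of the jamps):
//   fptype me2 = 0;
//   for( int icol = 0; icol < ncolor; icol++ )
//   {
//     fptype ztR = cf2.value[icol][icol] * jampR[icol]; // diagonal term, i.e. cf[i][i]/denom[i]
//     fptype ztI = cf2.value[icol][icol] * jampI[icol];
//     for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only ...
//     {
//       ztR += cf2.value[icol][jcol] * jampR[jcol]; // ... using the precomputed 2*cf[i][j]/denom[i]
//       ztI += cf2.value[icol][jcol] * jampI[jcol];
//     }
//     me2 += jampR[icol] * ztR + jampI[icol] * ztI; // AMA + BMB (cross terms cancel for symmetric M)
//   }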
- Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv +=
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
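// (Sketch of the hazard this avoids, not generated code: with a plain '+=', two helicity
// streams updating the same event/color slot could interleave as
//   fptype x = colAllJamp2s[idx]; // stream 1 reads x ... stream 2 reads the same stale x
//   colAllJamp2s[idx] = x + a;    // stream 1 writes x+a
//   colAllJamp2s[idx] = x + b;    // stream 2 writes x+b, silently dropping stream 1's |jamp|^2
// whereas atomicAdd performs the whole read-modify-write as one indivisible operation.)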
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1158,22 +1099,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators (the squared jamp2s are summed in step 1b below) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index de4fd12c37..515a957ce5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
 //static const int namplitudes = 14;
 //static const int ncomb = 64; // CPPProcess::ncomb
@@ -125,23 +126,26 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+ fptype* allJamps, // output: jamp[ncolor*2*nevt]
+ fptype* allWfs, // output: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+ const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
 bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -155,34 +159,46 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+ int* allselhel, // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+ fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 639c7207e3..806033a9ec 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 8fc5eeb386..e841acfd24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) 
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // the quadratic form (A-iB)M(A+iB) expands to AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel because M is real and symmetric)
+ // In addition, in C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain some speed (though not a full factor of 2) as we only loop over the upper-triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype_sv deltaMEs_next = { 0 };
+ // Mixed mode: merge two neppV vectors into one neppV2 vector
+ fptype2_sv jampR_sv[ncolor];
+ fptype2_sv jampI_sv[ncolor];
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+ jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+ }
+#else
+ const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+ // Loop over icol
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRi_sv = jampR_sv[icol];
+ fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+ fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+ fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+ fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+ fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+ // Loop over jcol
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ {
+ // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRj_sv = jampR_sv[jcol];
+ fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+ fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+ fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+ ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+ ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+ }
+ fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ deltaMEs += fpvsplit0( deltaMEs2 );
+ deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+ deltaMEs += deltaMEs2;
+#endif
+ }
+ // *** STORE THE RESULTS ***
+ using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+ // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+ MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+ fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+ MEs_sv_next += deltaMEs_next;
+#endif
+ }
+#endif
+
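As a cross-check of the algebra in the comments above, here is a minimal standalone sketch (not part of the generated code; it assumes only the ncolor=6 colorMatrix and colorDenom values quoted earlier, plus arbitrary test amplitudes) showing that the diagonal-plus-doubled-upper-triangle sum in real arithmetic reproduces the full complex quadratic form jamp† (CF/denom) jamp:

  #include <cassert>
  #include <cmath>
  #include <complex>
  #include <cstdio>

  int main()
  {
    constexpr int ncolor = 6;
    constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                            { 9, 27, 3, 9, 9, 3 },
                                            { 9, 3, 27, 9, 9, 3 },
                                            { 3, 9, 9, 27, 3, 9 },
                                            { 3, 9, 9, 3, 27, 9 },
                                            { 9, 3, 3, 9, 9, 27 } };
    const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 }, { 0.9, 1.0 }, { -1.1, 1.2 } }; // arbitrary test amplitudes
    // Full quadratic form: |M|^2 = sum_ij conj(J_i) * ( CF_ij / denom_i ) * J_j (the imaginary part cancels as CF is real and symmetric)
    std::complex<double> meFull = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        meFull += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
    // Triangular form: diagonal terms once, off-diagonal terms doubled, real arithmetic only (as in color_sum_cpu above)
    double meTri = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = cf[i][i] / denom[i] * jamp[i].real();
      double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    assert( std::abs( meFull.real() - meTri ) < 1e-10 ); // the two computations agree up to rounding
    printf( "full = %f, triangular = %f\n", meFull.real(), meTri );
    return 0;
  }

Because CF here is positive semi-definite with unit denominators, both loops accumulate the same 2*CF_ij*( Re J_i Re J_j + Im J_i Im J_j ) cross terms; the triangular variant simply hoists the factor 2 and the 1/denom normalization into the constexpr matrix.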
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h new file mode 100644 index 0000000000..7419b50278 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 14 ***
+ // Wavefunction(s) for diagram number 1
+ ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+ oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+ FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] );
+ FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 14 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 14 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 14 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 14 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 14 *** + // Wavefunction(s) for diagram number 10 + FFV1_1( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 14 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 14 *** + // Wavefunction(s) for diagram number 12 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 14 *** + // Wavefunction(s) for diagram number 13 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 14 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 9a6d844439..e4cc5c2814 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index bf560d981f..2cd230128e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
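The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer upper-triangular array of NCOLOR*(NCOLOR+1)/2 entries plus a single common DENOM, and the inner loop now runs DO J = I, NCOLOR with a running CF_INDEX. The off-diagonal entries are stored pre-doubled (27,18,18,6,6,18 instead of the full row 27,9,9,3,3,9), which is why summing only j >= i and taking a real part still reproduces the full symmetric quadratic form. The following is a minimal C++ sketch of that storage trick, using the integer values from the diff; the function and variable names here are ours, not from the generated code:

// Triangular color sum sketch: off-diagonals pre-doubled, one division at the end.
#include <array>
#include <complex>

constexpr int ncolor = 6;
constexpr std::array<int, ncolor * ( ncolor + 1 ) / 2> cf = {
  27, 18, 18, 6, 6, 18, // row 1: diagonal 27, then 2*{9,9,3,3,9}
  27, 6, 18, 18, 6,     // row 2, from the diagonal onwards
  27, 18, 18, 6,
  27, 6, 18,
  27, 18,
  27 };
constexpr int denom = 1; // common denominator, applied once at the end

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j]; // triangular loop
    me2 += std::real( ztemp * std::conj( jamp[i] ) ); // cross terms pair up as 2*Re(...)
  }
  return me2 / denom;
}

Besides halving the number of multiply-adds, storing small integers also eliminates the long D+00 continuation lines of the old DATA statements, which is most of the textual shrinkage visible in the hunk.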
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,20 +103,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,57 +171,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -231,377 +284,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
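The buffer comments above and below are the crux of this refactoring: in C++ all diagrams still run in one function call, so wavefunctions can stay in local, trivial-access w_sv arrays, while in CUDA each diagramN is now a separate kernel launch, so intermediates must persist between launches in a global-memory super-buffer with the SOA layout that accessors like DeviceAccessJamp2 earlier in this diff expose (buffer[islot * nevt + ievt]). A toy CUDA sketch of that pattern, with purely illustrative names, not the generated ones:

// Two kernels communicating through a persistent SOA buffer, as the split
// diagram kernels do via the allWfs super-buffer (names here are hypothetical).
#include <cuda_runtime.h>

__device__ inline double& slotAccess( double* buffer, const int islot )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return buffer[islot * nevt + ievt]; // coalesced: consecutive threads, consecutive addresses
}

__global__ void fillSlot0( double* wfs ) { slotAccess( wfs, 0 ) = (double)threadIdx.x; }

__global__ void useSlot0( double* wfs, double* out )
{
  slotAccess( out, 0 ) = 2. * slotAccess( wfs, 0 ); // second launch sees what the first wrote
}

int main()
{
  const int blocks = 2, threads = 32, nslot = 3;
  double *wfs, *out;
  cudaMalloc( &wfs, nslot * blocks * threads * sizeof( double ) );
  cudaMalloc( &out, blocks * threads * sizeof( double ) );
  fillSlot0<<<blocks, threads>>>( wfs );
  useSlot0<<<blocks, threads>>>( wfs, out );
  cudaDeviceSynchronize();
  cudaFree( wfs );
  cudaFree( out );
  return 0;
}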
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], 
w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1.
/ 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxcx_ttxuxcx()?)
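The removed '*** COLOR CHOICE BELOW ***' block above accumulates jamp2 += |jamp|^2 per color; the new select_col kernel later in this diff turns those running sums into an event-by-event color pick by cumulating the icolamp-allowed contributions and comparing a uniform random number against the cumulative fractions. A small standalone sketch of that selection logic, with hypothetical names rather than the generated API:

// Pick a color in the Fortran range [1,ncolor] from per-color |jamp|^2 sums.
#include <cassert>

int chooseColor( const double jamp2[], const bool allowed[], int ncolor, double rnd )
{
  assert( ncolor <= 16 );
  double target[16]; // cumulative sums (toy bound; the generated code sizes this exactly)
  double running = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    if( allowed[icol] ) running += jamp2[icol]; // only icolamp-allowed colors contribute
    target[icol] = running;
  }
  for( int icol = 0; icol < ncolor; icol++ )
    if( rnd < target[icol] / target[ncolor - 1] ) return icol + 1; // Fortran indexing [1,ncolor]
  return ncolor; // unreachable for rnd in [0,1) with a nonzero total
}

This mirrors the targetamp loop visible further below in select_col; the division by the last cumulative entry normalizes the sums into a discrete CDF without a separate pass.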
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -688,7 +511,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 
+596,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -867,26 +702,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +729,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype*
colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1048,22 +1087,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,17 +1108,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1111,93 +1147,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this branch should never be reached for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1268,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,25 +1277,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1299,8 +1311,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1330,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1437,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 13a02cdb83..e99911c34f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,17 +81,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -127,23 +128,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -157,34 +161,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index bf9951e502..a781041f7d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 24b0abb30c..d25e751436 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) UX1=PDG2PDF(LPP(IB(1)),-2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -555,6 +559,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor;
icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB, which reduces to AMA + BMB because M is also symmetric (AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed here (though not a factor 2...) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + +
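Side note: the equivalence that the triangular implementation above relies on is easy to check in isolation. The sketch below is a standalone illustration with toy ncolor=3 values and equal denominators (so that the normalized matrix stays symmetric); it is not code from this patch. It verifies that summing the diagonal once and the doubled upper triangle, over real and imaginary parts separately, reproduces the naive complex quadratic form J^dagger (M/denom) J, which is exactly the identity that the constexpr cf2 matrix encodes at compile time:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 3; // toy size (the process above has ncolor=6)
  // Symmetric toy color matrix and equal denominators (illustrative values only)
  const double colorMatrix[ncolor][ncolor] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } };
  const double colorDenom[ncolor] = { 3, 3, 3 };
  const std::complex<double> jamp[ncolor] = { { 1.0, -0.5 }, { -2.0, 0.25 }, { 0.5, 1.5 } };
  // Naive complex quadratic form: ME = sum_ij conj(J_i) * ( M_ij / denom_i ) * J_j
  std::complex<double> meNaive = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meNaive += std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j];
  // Triangular real form: diagonal once, upper triangle doubled, real and imaginary parts summed separately
  double meTriangular = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].real();
    double ztempI = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].real();
      ztempI += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].imag();
    }
    meTriangular += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meNaive.imag() ) < 1e-12 );                // real symmetric M: the quadratic form is real
  assert( std::abs( meNaive.real() - meTriangular ) < 1e-12 ); // AMA + BMB equals the triangular sum
  return 0;
}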
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, just in case it helps performance, the same striding as in calculate_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Ztemp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h new file mode 100644 index 0000000000..0c601d8e61 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
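Side note: up to the column-major layout of the matrix in device memory, the two gpuBlasTgemm calls plus the two strided-batched gemms in color_sum_blas compute, for one helicity, the loops sketched below. The helper name and the plain double types are assumptions for this illustration (it is not code from the patch); the "new1" striding is the one documented above, i.e. allJamps[ix2 * ncolor * nevt + icol * nevt + ievt] holds the real (ix2=0) or imaginary (ix2=1) part of jamp(icol) for event ievt.

#include <vector>

// colorSumReference: illustrative reference for what color_sum_blas computes per helicity.
// normColMat[i * ncolor + j] is assumed to hold colorMatrix[i][j] / colorDenom[i].
void colorSumReference( double* allMEs,           // in/out: allMEs[nevt], |M|^2 contribution added for one helicity
                        const double* allJamps,   // input: jamps[2 * ncolor * nevt], "new1" striding
                        const double* normColMat, // input: normalized color matrix [ncolor * ncolor]
                        const int ncolor,
                        const int nevt )
{
  std::vector<double> ztemp( ncolor );
  for( int ix2 = 0; ix2 < 2; ix2++ ) // ix2=0: real parts, ix2=1: imaginary parts
  {
    const double* jamps = allJamps + ix2 * ncolor * nevt;
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // Step 1 (the gpuBlasTgemm calls): ztemp[i] = sum_j normColMat[i][j] * jamp_j(ievt)
      for( int i = 0; i < ncolor; i++ )
      {
        ztemp[i] = 0;
        for( int j = 0; j < ncolor; j++ )
          ztemp[i] += normColMat[i * ncolor + j] * jamps[j * nevt + ievt];
      }
      // Step 2 (the 1x1 strided-batched gemms with beta=1): allMEs[ievt] += dot( jamp(ievt), ztemp )
      for( int i = 0; i < ncolor; i++ )
        allMEs[ievt] += jamps[i * nevt + ievt] * ztemp[i];
    }
  }
}

Comparing such a reference against the color_sum_kernel or color_sum_cpu results on a few events is a cheap way to validate a new striding convention or BLAS backend.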
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 7 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 7 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 7 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 7 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 7 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 7 *** + // Wavefunction(s) for diagram number 6 + FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 7 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 2a76dfeffb..14d46077f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +408,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +452,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 
21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +547,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +561,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 83faf9192b..7fe0dd7a98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,489 +282,161 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 14 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 14 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 14 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 14 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. 
* amp_sv[0]; - - // *** DIAGRAM 5 OF 14 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 14 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 7 OF 14 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 8 OF 14 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 14 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 10 OF 14 *** - - // Wavefunction(s) for diagram number 10 - FFV1_1( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. 
* amp_sv[0]; - - // *** DIAGRAM 11 OF 14 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - // *** DIAGRAM 12 OF 14 *** - - // Wavefunction(s) for diagram number 12 - FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 13 OF 14 *** - - // Wavefunction(s) for diagram number 13 - FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. 
* amp_sv[0]; - - // *** DIAGRAM 14 OF 14 *** - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxux_ttxuxux()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += 
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities 
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1160,20 +1101,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); 
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; 
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 0b67fca178..ee62f5cc48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 14; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index f8d2319067..dcc832fcc1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index f9adb0c2a2..4bef7f631c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) SX1=PDG2PDF(LPP(IB(1)),-3, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) SX1(IVEC)=PDG2PDF(LPP(IB(1)),-3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
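The Q2FACT hunks above all follow one pattern: the factorisation scale is now read with the literal beam slot (1 or 2) instead of the permuted index IB(i), while LPP, XBK and the PDF beam argument keep using IB(i). A one-line C++ analogue of the change (a sketch under that reading; q2fact and the slot convention are assumptions, not the Fortran code itself):

    #include <cmath>
    // Sketch: the scale follows the beam slot (1 or 2, Fortran convention), no longer IB().
    inline double qscaleForSlot( const double q2fact[2], int slot )
    {
      return std::sqrt( q2fact[slot - 1] ); // before the fix: q2fact[ib[slot - 1] - 1]
    }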
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; 
icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h new file mode 100644 index 0000000000..146b88ee10 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
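Every diagramN kernel below repeats one pattern: compute the diagram's wavefunctions and a single amplitude, add |amp|^2 to the multichannel numerator (only when channelId equals this diagram's number) and to the denominator (for any non-zero channelId), then scatter the amplitude into the color-ordered jamps with fixed rational (occasionally imaginary) coefficients. A scalar sketch of that pattern (hypothetical helper, not the generated code):

    #include <complex>
    // Sketch of the accumulation performed by each generated diagramN kernel.
    inline void accumulateDiagram( unsigned int channelId,               // 0 disables SDE multichannel mode
                                   unsigned int thisDiagram,             // 1-based number of this diagram
                                   const std::complex<double>& amp,      // the diagram's single amplitude
                                   double& numerator,                    // SDE numerator, running sum
                                   double& denominator,                  // SDE denominator, running sum
                                   std::complex<double>* jamp,           // ncolor color-ordered partial amplitudes
                                   const std::complex<double>* coeff,    // e.g. { 0, -1./12., -1./12., 1./36., 1./4., 0 } for diagram 1
                                   int ncolor )
    {
      const double amp2 = std::norm( amp ); // |amp|^2
      if( channelId == thisDiagram ) numerator += amp2;
      if( channelId != 0 ) denominator += amp2;
      for( int icol = 0; icol < ncolor; icol++ )
        jamp[icol] += coeff[icol] * amp; // fixed color coefficients per diagram
    }

In the generated kernels the coefficients are inlined per diagram and the jamps are updated through J_ACCESS::kernelAccessIcol with the event striding described above.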
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 14 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 14 ***
+    // Wavefunction(s) for diagram number 6
+    FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 14 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 14 ***
+    // Wavefunction(s) for diagram number 8
+    // (none)
+    // Amplitude(s) for diagram number 8
+    VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 14 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 14 ***
+    // Wavefunction(s) for diagram number 10
+    FFV1_1( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 14 ***
+    // Wavefunction(s) for diagram number 11
+    FFV1_2( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 14 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 13 OF 14 ***
+    // Wavefunction(s) for diagram number 13
+    FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 13
+    FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 14 OF 14 ***
+    // Wavefunction(s) for diagram number 14
+    // (none)
+    // Amplitude(s) for diagram number 14
+    FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
index 35761964e7..a4ee136e47 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
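
The matrix1.f hunk above replaces the full NCOLOR x NCOLOR REAL*8 color matrix by a packed integer upper triangle plus a common DENOM, and lets the inner loop start at J = I. This works because CF is symmetric: each off-diagonal entry is stored once with twice its value, and the missing J < I half of the bilinear sum is recovered through the doubled coefficient. A standalone C++ sketch (illustration only, not part of the patch; the matrix values are copied from the DATA statements above) checking that the triangular sum reproduces the full one:

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const int ncolor = 6;
  // Full symmetric color matrix, rows as in the old REAL*8 DATA statements
  const double cf[6][6] = { { 27, 9, 9, 3, 3, 9 },
                            { 9, 27, 3, 9, 9, 3 },
                            { 9, 3, 27, 9, 9, 3 },
                            { 3, 9, 9, 27, 3, 9 },
                            { 3, 9, 9, 3, 27, 9 },
                            { 9, 3, 3, 9, 9, 27 } };
  // Packed upper triangle from the new DATA statements: diagonal kept, off-diagonal doubled
  const double cfPacked[21] = { 27, 18, 18, 6, 6, 18, 27, 6, 18, 18, 6, 27, 18, 18, 6, 27, 6, 18, 27, 18, 27 };
  const double denom = 1; // DATA DENOM/1/
  const std::complex<double> jamp[6] = { { 1, 2 }, { -3, 1 }, { 0.5, -1 }, { 2, 0 }, { -1, -1 }, { 0, 3 } };
  double matrixFull = 0, matrixPacked = 0;
  for( int i = 0; i < ncolor; i++ ) // old code: full double loop over CF(J,I)
    for( int j = 0; j < ncolor; j++ )
      matrixFull += ( cf[i][j] * jamp[j] * std::conj( jamp[i] ) ).real();
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ ) // new code: J runs from I, CF_INDEX walks the packed triangle
  {
    std::complex<double> ztemp( 0, 0 );
    for( int j = i; j < ncolor; j++ ) ztemp += cfPacked[cfIndex++] * jamp[j];
    matrixPacked += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  matrixPacked /= denom; // one final division replaces the non-integer matrix entries
  // The full bilinear form is real, so taking Re( 2*CF_ij * JAMP_j * conj(JAMP_i) )
  // accounts for both the (i,j) and (j,i) terms of the symmetric sum
  assert( std::abs( matrixFull - matrixPacked ) < 1e-9 );
  return 0;
}
```

Besides halving the storage, the packed integer form avoids NCOLOR*(NCOLOR-1)/2 redundant multiplications per helicity and color-order combination.
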
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+
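
The three candidate jamp stridings discussed in the comments above differ only in index arithmetic. A minimal standalone sketch (illustration only; the helper names are hypothetical) of the "old" and "new1" index formulas quoted in DeviceAccessJamp:

```cpp
#include <cassert>

// Index of the real (part=0) or imag (part=1) component of jamp[icol]
// for event ievt, under the two layouts quoted above.
inline int jampIndexOld( int part, int icol, int ievt, int /*ncolor*/, int nevt )
{
  return icol * 2 * nevt + part * nevt + ievt; // "old": one 2*nevt block per color
}
inline int jampIndexNew1( int part, int icol, int ievt, int ncolor, int nevt )
{
  return part * ncolor * nevt + icol * nevt + ievt; // "new1": real plane then imag plane
}

int main()
{
  const int ncolor = 6, nevt = 32;
  // In both layouts consecutive events of one color are contiguous (coalesced access)...
  assert( jampIndexOld( 0, 3, 8, ncolor, nevt ) - jampIndexOld( 0, 3, 7, ncolor, nevt ) == 1 );
  assert( jampIndexNew1( 0, 3, 8, ncolor, nevt ) - jampIndexNew1( 0, 3, 7, ncolor, nevt ) == 1 );
  // ...but only "new1" packs all ncolor*nevt real parts into one dense matrix,
  // which is what allows handing the buffer to a BLAS GEMM without repacking.
  assert( jampIndexNew1( 0, ncolor - 1, nevt - 1, ncolor, nevt ) == ncolor * nevt - 1 );
  return 0;
}
```

This is why the "new1" striding can serve both the plain CUDA kernels and the cuBLAS path with a single buffer.
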
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
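
The new BLASLIBFLAGS are needed because, with the "new1" jamp layout, the per-helicity color sum becomes dense linear algebra. A host-side reference sketch (illustration only, not the actual color_sum_gpu implementation; the overall color normalization via createNormalizedColorMatrix and the mixed-precision fptype2 path are omitted) of the computation that a single cublasDgemm/hipblasDgemm call replaces:

```cpp
#include <cassert>
#include <vector>

// With "new1" striding, each jamp component plane is a dense [ncolor x nevt]
// matrix J, and the color sum per event e is
//   ME[e] += sum_i sum_k J[i][e] * CF[i][k] * J[k][e]   (real plane + imag plane),
// i.e. one GEMM T = CF * J followed by a column-wise dot product of J and T.
void colorSumReference( const std::vector<double>& cf, // [ncolor*ncolor] symmetric color matrix
                        const std::vector<double>& jamps, // [2*ncolor*nevt] "new1": real plane then imag plane
                        std::vector<double>& allMEs, // [nevt] output, incremented
                        int ncolor, int nevt )
{
  for( int part = 0; part < 2; part++ ) // 0 = real plane, 1 = imag plane
  {
    const double* j = &jamps[part * ncolor * nevt];
    std::vector<double> t( ncolor * nevt, 0. ); // T = CF * J (the GEMM)
    for( int i = 0; i < ncolor; i++ )
      for( int k = 0; k < ncolor; k++ )
        for( int e = 0; e < nevt; e++ )
          t[i * nevt + e] += cf[i * ncolor + k] * j[k * nevt + e];
    for( int i = 0; i < ncolor; i++ ) // ME[e] += column e of J dot column e of T
      for( int e = 0; e < nevt; e++ )
        allMEs[e] += j[i * nevt + e] * t[i * nevt + e];
  }
}

int main()
{
  // One event, two colors: CF = [[2,1],[1,2]], jamp = (1,2) purely real
  std::vector<double> cf = { 2, 1, 1, 2 };
  std::vector<double> jamps = { 1, 2, 0, 0 }; // real plane (1,2), imag plane (0,0)
  std::vector<double> mes( 1, 0. );
  colorSumReference( cf, jamps, mes, 2, 1 );
  assert( mes[0] == 14. ); // (1,2) . ( CF . (1,2) ) = (1,2) . (4,5)
  return 0;
}
```

The imaginary cross terms drop out because CF is real and symmetric, so only the two real GEMMs are needed; batching all events into one GEMM is what makes offloading to cuBLAS/hipBLAS attractive.
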
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c     Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+        smax = min(smax, dsqrt_shatmax**2)
+      endif
+
      pass=.true.
      if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
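A note on the 'flavour_bias' run_card parameter added above: enhancing the sampling probability of one flavour by a factor while dividing the weights of those events by the same factor leaves every weighted average unchanged in expectation, which is why 'event_norm' must be switched to 'bias'. A toy sketch of that bookkeeping (illustrative only, not the MadEvent implementation; all names and numbers below are ours):

    import random

    def sample_flavour(p_bottom=0.05, factor=100.0):
        """Toy biased sampler: returns (is_bottom, event_weight)."""
        # biased probability of picking a bottom-flavour event
        p_biased = p_bottom * factor / (p_bottom * factor + (1.0 - p_bottom))
        if random.random() < p_biased:
            # enhanced probability, compensated (smaller) weight
            return True, p_bottom / p_biased
        return False, (1.0 - p_bottom) / (1.0 - p_biased)

    # the weighted bottom fraction stays close to p_bottom, so the physics is unchanged
    draws = [sample_flavour() for _ in range(100000)]
    frac = sum(w for b, w in draws if b) / sum(w for _, w in draws)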
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py @@ -147,9 
+147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
\<B\>$proc\<\/B\>\<BR\> \n"; - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "\<BR\>\<BR\> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "\<BR\>\<BR\> To save bandwidth not all diagrams were converted to PNG."; print PAGE "\<BR\>\<BR\> 
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%g needed events = %g', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points that are needed to reach the target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split = max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] 
= float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = 
math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
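For reference, a minimal sketch of the renaming that PY8Card.write applies (via the interface_to_164 map introduced earlier in this diff) when the card is written for Pythia8's main164 directly rather than through the MG5aMC_PY8_interface; the helper function and the dict subset below are ours, not the plugin's API:

    # subset of the interface_to_164 map shown earlier in this diff
    INTERFACE_TO_164 = {
        'HEPMCoutput:file': 'HepMC:output',
        'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    }

    def translate_setting(param, value, direct_pythia_input=True):
        """Return the .cmd lines to emit for one old-interface setting."""
        if direct_pythia_input and param in INTERFACE_TO_164:
            new_param = INTERFACE_TO_164[param]
            lines = []
            if new_param == 'HepMC:output':
                # special case: HepMC output needs two flags in main164
                lines.append('Main:HepMC=on')
            lines.append('%s=%s' % (new_param, value))
            return lines
        return ['%s=%s' % (param, value)]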
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents'] = self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s ./%%s %s PY8Card.dat >& PY8_log.txt """ % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the directories providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size < 10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! 
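The gridpack refinement now enables channel splitting ('split_channels': True) so that a channel needing many events is spread over several jobs. A sketch of the splitting arithmetic used in get_job_for_event above; the helper name and the default values (2500 mirrors the new 'maxevts' default of GridPackCmd, max_splitting=100 is illustrative) are ours:

    def compute_nb_split(needed_event, max_request_event=2500,
                         max_splitting=100, split_channels=True):
        """Ceil-divide the requested events into jobs, with a hard cap."""
        if not split_channels:
            return 1
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        return max(1, min(nb_split, max_splitting))

    assert compute_nb_split(6000) == 3   # ceil(6000 / 2500)
    assert compute_nb_split(100) == 1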
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/madevent b/epochX/cudacpp/pp_tt012j.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/madevent +++ b/epochX/cudacpp/pp_tt012j.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('It looks like madgraph is in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
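
As a quick illustration of the default logic introduced here (a sketch, not part of the patch; the header name is real, the constant hasGpuBlas is invented for this example): a translation unit can probe at compile time whether a BLAS backend was compiled in.

    #include "mgOnGpuConfig.h"
    // MGONGPU_HAS_NO_BLAS is left undefined by default in CUDA (__CUDACC__) and HIP
    // (__HIPCC__) builds, and is always defined in plain C++ builds.
    #ifdef MGONGPU_HAS_NO_BLAS
    constexpr bool hasGpuBlas = false; // C++ build, or -DMGONGPU_HAS_NO_BLAS was passed
    #else
    constexpr bool hasGpuBlas = true; // CUDA/HIP build with cuBLAS/hipBLAS assumed available
    #endif
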
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 85f434b58f..2ebaacfa09 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12868547439575195  +DEBUG: model prefixing takes 0.1283280849456787  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.699 s +1 processes with 72 diagrams generated in 3.707 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering 
PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -114,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s -Wrote files for 119 helas calls in 0.388 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 
56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1665]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.185 s +Wrote files for 119 helas calls in 0.383 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.309 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.341 s +ALOHA: aloha creates 10 routines in 0.324 s VVV5 VVV5 FFV1 @@ -142,37 +142,37 @@ ALOHA: aloha creates 10 routines in 0.341 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.169s -user 0m6.853s -sys 0m0.298s +real 0m7.149s +user 0m6.831s +sys 0m0.290s Code generation completed in 7 seconds ************************************************************ * * @@ -186,7 +186,7 @@ Code generation completed in 7 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -194,9 +194,9 @@ Code generation completed in 7 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -216,7 +216,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -224,9 +224,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 9bcf8cac8c..1e922c1025 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
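
The GpuAbstraction.h changes below extend the single-source macro layer that maps gpu* names onto either the CUDA or the HIP runtime. A minimal usage sketch, assuming a CUDA or HIP build (the copyRoundTrip helper is invented here; gpuMalloc, gpuMemcpy and gpuFree are the macros from this header, and checkGpu comes from GpuRuntime.h):

    #include "GpuRuntime.h" // brings in GpuAbstraction.h and the checkGpu error macro
    #ifdef MGONGPUCPP_GPUIMPL
    // Round-trip n doubles through device memory; the same code compiles for CUDA
    // (cudaMalloc/cudaMemcpy/cudaFree) and for HIP (hipMalloc/hipMemcpy/hipFree).
    void copyRoundTrip( const double* hstIn, double* hstOut, size_t n )
    {
      double* dev = nullptr;
      gpuMalloc( &dev, n * sizeof( double ) );
      gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );
      gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
      gpuFree( dev );
    }
    #endif
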
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + +#include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
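
The GpuAbstraction.h hunks above build a vendor-neutral BLAS vocabulary on top of the existing gpu* macros: gpuBlas* resolves to cuBLAS names under __CUDACC__ and to hipBLAS names under __HIPCC__, and the gpuBlasT* aliases pick the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A hedged sketch of how calling code stays vendor- and precision-neutral (the function, buffer names and matrix shapes are illustrative, not the plugin's actual color-sum layout; fptype2 is the color-algebra floating-point type from mgOnGpuConfig.h, and checkGpuBlas is the helper added in GpuRuntime.h just below; at runtime the plugin only takes this BLAS path when CUDACPP_RUNTIME_BLASCOLORSUM is set, as shown in MatrixElementKernels.cc further down):

  #ifndef MGONGPU_HAS_NO_BLAS
  // One small [ncolor x ncolor] * [ncolor x 1] product per event, batched over
  // nevt events on the stream attached to this handle. In a double-precision
  // build (MGONGPU_FPTYPE2_FLOAT undefined) gpuBlasTgemmStridedBatched expands
  // to cublasDgemmStridedBatched or hipblasDgemmStridedBatched.
  inline void batchedColorProductSketch( gpuBlasHandle_t handle, const fptype2* dColorMatrix,
                                         const fptype2* dJamps, fptype2* dTmp, int ncolor, int nevt )
  {
    const fptype2 alpha = 1, beta = 0;
    checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                              ncolor, 1, ncolor, &alpha,
                                              dColorMatrix, ncolor, 0, // stride 0: reuse one matrix for all events
                                              dJamps, ncolor, ncolor,  // one jamp vector per event
                                              &beta, dTmp, ncolor, ncolor, nevt ) );
  }
  #endif
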
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
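
The checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above extends the pre-existing checkGpu/assertGpu pattern to gpuBlasStatus_t return codes: any non-SUCCESS status prints the numeric code with file and line and then aborts via assert. A small usage sketch for GPU builds (makeBlasHandleOnStream is a hypothetical helper written for this note; the plugin performs the equivalent calls inline in MatrixElementKernelDevice::computeGoodHelicities further down in this diff):

  #ifndef MGONGPU_HAS_NO_BLAS
  inline gpuBlasHandle_t makeBlasHandleOnStream( gpuStream_t stream )
  {
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );           // aborts with file:line if the BLAS runtime fails
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // all work on this handle then runs on 'stream'
    return handle;
  }
  #endif
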
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of
nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h index a4f3a481bb..84c20a1f30 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
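
The MemoryAccessAmplitudes.h deletion above is safe because the removed MemoryAccessAmplitudesBase hardcoded neppA = 1, so its AOSOA decode buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] was already plain per-event AOS indexing, buffer[ievt * nx2 + ix2]; the surviving trivial reinterpret_cast access is therefore equivalent. A minimal worked check of that equivalence (the scalar typedefs stand in for fptype and cxtype_sv and are illustrative only):

  #include <cassert>
  #include <complex>
  typedef double fptype;
  typedef std::complex<double> cxtype;
  int main()
  {
    constexpr int nx2 = 2;                         // real and imaginary components
    fptype buffer[3 * nx2] = { 1, 2, 3, 4, 5, 6 }; // three events in AOS layout
    const int ievt = 1, ix2 = 0;
    // AOSOA decode with neppA=1 (ipagA=ievt, ieppA=0) ...
    assert( buffer[ievt * nx2 * 1 + ix2 * 1 + 0] == 3 );
    // ... picks the same element as the trivial reinterpret_cast view
    cxtype* amps = reinterpret_cast<cxtype*>( buffer );
    assert( amps[ievt] == cxtype( 3, 4 ) );
    return 0;
  }
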
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h index 2f711d8cc1..7f3a4e3dca 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
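
The wavefunction access above goes the opposite way from the amplitudes: on the device it becomes non-trivial, because with the per-helicity kernel splitting each thread now addresses its own slice of a global wavefunction buffer at offset ievt * CPPProcess::nw6 * mgOnGpu::nx2. A sizing sketch for the buffers this implies (the numbers are illustrative: nwf = 18 is the value quoted for this P1_gg_ttxttx subprocess later in this diff, nw6 = 6 and nx2 = 2 as usual, and the nGoodHel factor matches the m_pHelWfs super-buffer allocated in MatrixElementKernels.cc above):

  #include <cstddef>
  #include <iostream>
  int main()
  {
    const std::size_t nwf = 18, nw6 = 6, nx2 = 2;      // 216 fptype values per event per helicity
    const std::size_t nevt = 16384, nGoodHel = 64;     // illustrative grid size and helicity count
    const std::size_t oneHel = nwf * nw6 * nx2 * nevt; // the "one-helicity" buffer used for helicity filtering
    std::cout << "one-helicity wfs buffer:    " << oneHel << " fptype elements\n";
    std::cout << "many-helicity super-buffer: " << nGoodHel * oneHel << " fptype elements\n";
    return 0;
  }
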
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators
typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 96d77e5403..42eaa96778 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer 
includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a 
function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,1333 +279,277 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
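// ------------------------------------------------------------------------------------
// [Editor's note] The WAVEFUNCTION BUFFERS split introduced here is the heart of the
// kernel splitting: in C++ the wavefunctions can stay in a small local array (trivial
// access, one event page at a time), while in CUDA they must live in a global-memory
// buffer (allWfs) that all per-diagram kernels share. Below is a minimal standalone
// sketch of the structure-of-arrays (SoA) indexing convention assumed by such shared
// buffers (compare DeviceAccessJamp2::kernelAccessIcol above, which indexes
// buffer[icol * nevt + ievt]); the nevt and ncol values are illustrative only:

#include <cassert>
#include <vector>
int main()
{
  const int nevt = 8; // illustrative: number of events (one GPU thread per event)
  const int ncol = 3; // illustrative: number of color flows (or wavefunction slots)
  std::vector<double> buffer( ncol * nevt );
  // SoA fill: for each icol, all events are contiguous in memory
  for( int icol = 0; icol < ncol; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      buffer[icol * nevt + ievt] = 100. * icol + ievt;
  // For a fixed icol, consecutive ievt sit at adjacent addresses: on a GPU, a warp of
  // consecutive threads (consecutive ievt) therefore issues a single coalesced load
  assert( &buffer[0 * nevt + 1] == &buffer[0 * nevt + 0] + 1 );
  return 0;
}

// This is why the CUDA branch trades the old per-event local buffers for event-major
// global arrays: each diagram kernel indexes them with its own (blockIdx, threadIdx).
// ------------------------------------------------------------------------------------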
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif

     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 72 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( 
w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 72 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 72 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 72 *** - - // Wavefunction(s) for diagram number 4 - FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 72 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 72 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 6 - VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 72 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 72 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 72 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 10 OF 72 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 72 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 72 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 13 OF 72 *** - - // Wavefunction(s) for diagram number 13 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 72 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 72 *** - - // Wavefunction(s) for diagram number 15 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 16 OF 72 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 72 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 18 OF 72 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 19 OF 72 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 20 OF 72 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 72 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 72 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 72 *** - - // Wavefunction(s) for diagram number 23 - FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 72 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 25 OF 72 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 26 OF 72 *** - - // Wavefunction(s) for diagram number 26 - // (none) - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 27 OF 72 *** - - // Wavefunction(s) for diagram number 27 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 28 OF 72 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 29 OF 72 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 30 OF 72 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 72 *** - - // Wavefunction(s) for diagram number 31 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 32 OF 72 *** - - // Wavefunction(s) for diagram number 32 - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 72 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 34 OF 72 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 72 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 36 OF 72 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 37 OF 72 *** - - // Wavefunction(s) for diagram number 37 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 38 OF 72 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 72 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 40 OF 72 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 72 *** - - // Wavefunction(s) for diagram number 41 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 42 OF 72 *** - - // Wavefunction(s) for diagram number 42 - // (none) - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 43 OF 72 *** - - // Wavefunction(s) for diagram number 43 - FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 44 OF 72 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 45 OF 72 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 46 OF 72 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 47 OF 72 *** - - // Wavefunction(s) for diagram number 47 - FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 48 OF 72 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 49 OF 72 *** - - // Wavefunction(s) for diagram number 49 - // (none) - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 50 OF 72 *** - - // Wavefunction(s) for diagram number 50 - // (none) - - // Amplitude(s) for diagram number 50 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 51 OF 72 *** - - // Wavefunction(s) for diagram number 51 - FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 52 OF 72 *** - - // Wavefunction(s) for diagram number 52 - VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 72 *** - - // Wavefunction(s) for diagram number 53 - // (none) - - // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 54 OF 72 *** - - // Wavefunction(s) for diagram number 54 - VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 55 OF 72 *** - - // Wavefunction(s) for diagram number 55 - FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 56 OF 72 *** - - // Wavefunction(s) for diagram number 56 - VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 57 OF 72 *** - - // Wavefunction(s) for diagram number 57 - // (none) - - // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 58 OF 72 *** - - // Wavefunction(s) for diagram number 58 - // (none) - - // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 59 OF 72 *** - - // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 60 OF 72 *** - - // Wavefunction(s) for diagram number 60 - VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 60 - FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 61 OF 72 *** - - // Wavefunction(s) for diagram number 61 - // (none) - - // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 62 OF 72 *** - - // Wavefunction(s) for diagram number 62 - // (none) - - // Amplitude(s) for diagram number 62 - FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 63 OF 72 *** - - // Wavefunction(s) for diagram number 63 - FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 64 OF 72 *** - - // Wavefunction(s) for diagram number 64 - // (none) - - // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 65 OF 72 *** - - // Wavefunction(s) for diagram number 65 - // (none) - - // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 66 OF 72 *** - - // Wavefunction(s) for diagram number 66 - // (none) - - // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 67 OF 72 *** - - // Wavefunction(s) for diagram number 67 - // (none) - - // Amplitude(s) for diagram number 67 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. 
* amp_sv[0];
-
-    // *** DIAGRAM 68 OF 72 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += 1. / 2. * amp_sv[0];
-    jamp_sv[5] -= 1. / 2. * amp_sv[0];
-    jamp_sv[6] -= 1. / 2. * amp_sv[0];
-    jamp_sv[10] += 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 72 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += 1. / 2. * amp_sv[0];
-    jamp_sv[5] -= 1. / 2. * amp_sv[0];
-    jamp_sv[6] -= 1. / 2. * amp_sv[0];
-    jamp_sv[9] += 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 72 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[3] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-    VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-    VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] -= 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 71 OF 72 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 72 OF 72 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] -= 1. / 2. * amp_sv[0];
-
-    // *** COLOR CHOICE BELOW ***
-    // Store the leading color flows for choice of color
-    if( jamp2_sv ) // disable color choice if nullptr
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-    // *** COLOR MATRIX BELOW ***
-    // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?)
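// ------------------------------------------------------------------------------------
// [Editor's note] A minimal standalone check of the triangular color-sum trick used by
// the TriangularNormalizedColorMatrix struct just below. Since the color matrix cf is
// real and symmetric, the quadratic form
//   |M|^2 = sum_{i,j} conj(jamp[i]) * ( cf[i][j] / denom[i] ) * jamp[j]
// reduces to real arithmetic (AMA + BMB for jamp = A + iB, see #475) and can be summed
// over the upper triangle only, with the factor 2 and the 1/denom[i] normalization
// folded in at compile time. The 2x2 values below are toy numbers with a uniform
// denom, not the real 12x12 color matrix of this process:

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  const int ncolor = 2;
  const double cf[2][2] = { { 48, 16 }, { 16, 48 } }; // toy symmetric "color matrix"
  const double denom[2] = { 3, 3 };                   // toy (uniform) color denominators
  const std::complex<double> jamp[2] = { { 1., 2. }, { -3., 0.5 } };
  // Reference: the full quadratic form in complex arithmetic
  std::complex<double> me2ref = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2ref += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Triangular form: diagonal weight cf[i][i]/denom[i], off-diagonal weight 2*cf[i][j]/denom[i]
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
      ztempI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( me2 - me2ref.real() ) < 1e-12 );
  return 0;
}

// The CUDA branch keeps the full (non-triangular) loop because it was measured to be
// faster there, as noted in the removed comments above.
// ------------------------------------------------------------------------------------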
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 72 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1639,7 +636,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1674,6 +675,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1716,6 +721,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1818,26 +827,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1845,25 +854,40 @@ namespace 
mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype*
ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -2001,20 +1214,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -2026,17 +1233,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -2062,93 +1272,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
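+ // Worked example of the C-indexing-in, Fortran-indexing-out convention (toy 5-channel mapping for illustration only, not the generated mgOnGpu tables): with channel2iconfig = { 1, 2, -1, 3, 3 }, channelId=4 (Fortran-style, in [1,nchannels]) is looked up as channel2iconfig[4-1] and yields iconfig=3 (Fortran-style, in [1,nconfigSDE]), while channelId=3 yields channel2iconfig[3-1]=-1, i.e. no associated SDE iconfig, and must fail the sanity checks below.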
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2190,7 +1370,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2213,7 +1393,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2222,25 +1402,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -2250,8 +1436,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2267,11 +1455,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2373,14 +1562,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index d207c3303f..b147b40b3b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 76; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index ef1e17705f..95e59c2089 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 2086a21e98..b046c442e3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
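+ // This file implements the color sum of the QCD partial amplitudes: for one helicity, it adds to |M|^2 the quantity deltaME = sum_{i,j} ( Re(jamp_i) * cf_ij * Re(jamp_j) + Im(jamp_i) * cf_ij * Im(jamp_j) ), with cf_ij = colorMatrix[i][j] / colorDenom[i]. + // Toy numerical illustration (hypothetical ncolor=2 inputs, not this process's 12x12 matrix): with colorDenom = { 3, 3 }, colorMatrix = { { 16, -2 }, { -2, 16 } }, jamp0 = 1 + 0.5i and jamp1 = -0.25 + 2i, + // ztempR0 = ( 16*1 + (-2)*(-0.25) ) / 3 = 5.5, ztempI0 = ( 16*0.5 + (-2)*2 ) / 3 = 4/3, ztempR1 = ( (-2)*1 + 16*(-0.25) ) / 3 = -2, ztempI1 = ( (-2)*0.5 + 16*2 ) / 3 = 31/3, + // hence deltaME = 5.5*1 + (4/3)*0.5 + (-2)*(-0.25) + (31/3)*2 = 6 + 64/3 ~ 27.33.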
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)M(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // In addition, in C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (though not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = 
DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, the same striding as in compute_jamps and cuBLAS is used here, just in case this is better for performance
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/,       // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h new file mode 100644 index 0000000000..d29bb82ea5 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h @@ -0,0 +1,2237 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
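+
+// Schematic overview (a sketch of the common structure, inferred from the generated code
+// below rather than stated anywhere upstream): each diagramN kernel
+//   1. computes the wavefunctions w_fp[...] needed by that diagram (diagram1 also computes
+//      the external-particle wavefunctions via vxxxxx/oxxxxx/ixxxxx from the momenta);
+//   2. computes the amplitude amp_sv[0] through the relevant FFV/VVV vertex routine;
+//   3. if MGONGPU_SUPPORTS_MULTICHANNEL is defined, adds |amp|^2 to the single-diagram
+//      numerators (when channelId matches the diagram number) and to the denominators
+//      (whenever channelId != 0);
+//   4. accumulates amp_sv[0] into the color-ordered jamps with that diagram's color
+//      coefficients (e.g. +-1/2 and +-1/6, possibly times a factor i).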
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 72 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 );
+    VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 72 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 72 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 72 *** + // Wavefunction(s) for diagram number 4 + FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 72 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 72 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 6 + VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 72 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 72 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 72 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 72 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 72 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 72 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 72 *** + // Wavefunction(s) for diagram number 13 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 72 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 72 *** + // Wavefunction(s) for diagram number 15 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 72 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 72 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 72 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 72 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 72 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 72 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 72 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 72 *** + // Wavefunction(s) for diagram number 23 + FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 72 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 72 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 72 *** + // Wavefunction(s) for diagram number 26 + // (none) + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 72 ***
+    // Wavefunction(s) for diagram number 27
+    FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 72 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
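The numerical pattern in the jamp updates is standard SU(3) color algebra: a gluon exchanged between two quark lines is projected onto the color-flow basis through the Fierz identity

    T^a_{ij} T^a_{kl} = 1/2 ( delta_{il} delta_{jk} - 1/3 delta_{ij} delta_{kl} )

which is why each FFV-type amplitude feeds two jamps with coefficients 1/2 and 1/6 (= 1/2 * 1/3) of opposite sign, while the cxtype( 0, 1 ) factor appearing with the triple-gluon VVV5_0 amplitudes (as in diagram28 just above) is the imaginary unit carried by the i f^{abc} color structure. This reading is inferred from the generated coefficients, not stated in the patch.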
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 72 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 72 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 72 ***
+    // Wavefunction(s) for diagram number 31
+    FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 31
+    FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 72 ***
+    // Wavefunction(s) for diagram number 32
+    FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 72 ***
+    // Wavefunction(s) for diagram number 33
+    // (none)
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 72 ***
+    // Wavefunction(s) for diagram number 34
+    // (none)
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 72 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 72 ***
+    // Wavefunction(s) for diagram number 36
+    // (none)
+    // Amplitude(s) for diagram number 36
+    FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 72 ***
+    // Wavefunction(s) for diagram number 37
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 72 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 72 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 72 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 72 ***
+    // Wavefunction(s) for diagram number 41
+    FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 72 ***
+    // Wavefunction(s) for diagram number 42
+    // (none)
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 72 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 72 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
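The signature comment jamps[ncolor*2*nevtORneppV] implies ncolor complex color amplitudes stored as separate real and imaginary parts per event page. A minimal sketch of a kernelAccessIcol accessor consistent with that layout, assuming cxtype_sv is a (real, imaginary) pair of fptype_sv; the actual J_ACCESS class is defined elsewhere and may differ:

    // Sketch only: one plausible J_ACCESS::kernelAccessIcol for the C++ event-page case
    static inline cxtype_sv&
    kernelAccessIcol( fptype* jamps, const int icol )
    {
      // return a reference to the icol-th complex color amplitude of this event page
      return reinterpret_cast<cxtype_sv*>( jamps )[icol];
    }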
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 72 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 72 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 72 ***
+    // Wavefunction(s) for diagram number 47
+    FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 72 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 72 ***
+    // Wavefunction(s) for diagram number 49
+    // (none)
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 72 ***
+    // Wavefunction(s) for diagram number 50
+    // (none)
+    // Amplitude(s) for diagram number 50
+    VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 72 ***
+    // Wavefunction(s) for diagram number 51
+    FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 72 ***
+    // Wavefunction(s) for diagram number 52
+    VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
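Taken together, the two multichannel lines in every kernel accumulate |amp|^2 of the single selected diagram into numerators and |amp|^2 of every contributing diagram into denominators, so that the single-diagram-enhancement weight for an event can eventually be formed as

    w_SDE = numerators / denominators = |A_channel|^2 / sum_d |A_d|^2

summed over helicities, with channelId == 0 disabling the whole machinery as noted in the signature comment. This summary is inferred from the generated pattern above.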
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 72 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 72 ***
+    // Wavefunction(s) for diagram number 54
+    VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 72 ***
+    // Wavefunction(s) for diagram number 55
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 72 ***
+    // Wavefunction(s) for diagram number 56
+    VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 72 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 72 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 72 ***
+    // Wavefunction(s) for diagram number 59
+    FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] );
+    // Amplitude(s) for diagram number 59
+    FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 72 ***
+    // Wavefunction(s) for diagram number 60
+    VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 60
+    FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 72 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 72 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 72 *** + // Wavefunction(s) for diagram number 63 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 72 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 72 *** + // Wavefunction(s) for diagram number 65 + // (none) + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 72 *** + // Wavefunction(s) for diagram number 66 + // (none) + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 72 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 72 *** + // Wavefunction(s) for diagram number 68 + // (none) + // Amplitude(s) for diagram number 68 + VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 72 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 72 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 72 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 72 *** + // Wavefunction(s) for diagram number 72 + // (none) + // Amplitude(s) for diagram number 72 + VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
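All fourteen kernels above follow one template: include diagram_boilerplate.h, compute any new wavefunctions, compute the diagram amplitude, optionally update the single-diagram-enhancement (SDE) numerators and denominators, and fold the amplitude into the color-ordered jamps with fixed rational (or imaginary) coefficients. Below is a minimal scalar sketch of that accumulation step, assuming plain std::complex<double> in place of the plugin's cxtype_sv/fptype_sv types and a bare array in place of the J_ACCESS accessors (toyDiagram and its arguments are illustrative names, not plugin API):

  #include <array>
  #include <complex>

  using cxtype = std::complex<double>;

  constexpr int ncolor = 12; // leading colors for gg -> ttxttx (jamps 0..11 above)

  inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2

  // Analogue of one diagramXXX kernel: fold one amplitude into the
  // color-ordered jamps and into the single-diagram-enhancement sums.
  void toyDiagram( unsigned int thisDiagram,  // e.g. 59
                   unsigned int channelId,    // 0 disables SDE
                   const cxtype& amp,         // amplitude of this diagram
                   std::array<cxtype, ncolor>& jamp,
                   double& numerator,
                   double& denominator )
  {
    if( channelId == thisDiagram ) numerator += cxabs2( amp );
    if( channelId != 0 ) denominator += cxabs2( amp );
    // color coefficients of diagram 59: +1/2 on flow 10, -1/6 on flow 11
    jamp[10] += 1. / 2. * amp;
    jamp[11] -= 1. / 6. * amp;
  }

The uniform signature, which keeps channelIds, numerators and denominators even in non-multichannel builds, is what allows every kernel to include the same boilerplate header unchanged.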
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
index 45032ad41c..a833594d67 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(34)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -441,111 +442,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D
-     $ +00,0.000000000000000D+00,5.333333333333333D+00/
-      DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01
-     $ ,0.000000000000000D+00,-2.000000000000000D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,2.000000000000000D+00/
+      DATA DENOM/3/
+      DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/
 C     1 T(1,2,3,4) T(5,6)
-      DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00
-     $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D
-     $ +00,5.333333333333333D+00,0.000000000000000D+00/
-      DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01,
-     $ -2.000000000000000D+00,2.000000000000000D+00,
-     $ -6.666666666666666D-01/
+      DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/
 C     1 T(1,2,3,6) T(5,4)
-      DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00
-     $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
-      DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,2.000000000000000D+00,-2.000000000000000D+00,
-     $ -6.666666666666666D-01/
+      DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/
 C     1 T(1,2,5,4) T(3,6)
-      DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00
-
$ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -910,10 +844,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -922,6 +858,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
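The commented-out "old" and "new2" alternatives above document the layouts that were tried; only "new1" is active: the buffer holds a full ncolor*nevt plane of real parts followed by a plane of imaginary parts, with the event index fastest. A standalone sketch of the index arithmetic, assuming plain host doubles (in the plugin the same arithmetic sits inside DeviceAccessJamp, with nevt taken from the CUDA grid):

  #include <cassert>
  #include <vector>

  // "new1" layout: first a ncolor*nevt matrix of real parts, then a
  // ncolor*nevt matrix of imaginary parts, with ievt fastest.
  inline int jampReIndex( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
  inline int jampImIndex( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }

  int main()
  {
    const int ncolor = 12, nevt = 4;
    std::vector<double> jamps( 2 * ncolor * nevt, 0. );
    jamps[jampReIndex( 10, 3, ncolor, nevt )] += 0.5; // Re(jamp[10]) of event 3
    jamps[jampImIndex( 10, 3, ncolor, nevt )] -= 0.5; // Im(jamp[10]) of event 3
    assert( jamps[10 * nevt + 3] == 0.5 );
    assert( jamps[ncolor * nevt + 10 * nevt + 3] == -0.5 );
    return 0;
  }

Keeping ievt fastest makes every (real/imag, icol) plane contiguous across events, so the BLAS path can treat the buffer as two ncolor-by-nevt matrices without any repacking.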
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)

+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif

@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o
 	$(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS)
 endif
@@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin)
 $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe)
 endif
@@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob
 else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
 ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
 $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif
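The BLASLIBFLAGS added above link cuBLAS or hipBLAS into the GPU shared library and test executables so that color_sum_gpu can offload the color sum. Per event, the quantity computed is ME += Re( J^dagger * CF * J ) over the ncolor jamps; the matrix1.f hunk earlier stores the same symmetric CF as an integer upper triangle (diagonal 3*16=48, with off-diagonal entries carrying an extra factor 2 because the triangular loop visits each pair only once) and divides the common DENOM=3 out at the end. Below is a minimal sketch of that quadratic form, assuming a plain triple loop in place of the batched GEMM, and a cfmat array with the denominator already divided out (cfmat and colorSumToy are illustrative names, not the plugin API):

  #include <complex>
  #include <vector>

  using cxtype = std::complex<double>;

  // For each event: ME += Re( J^dagger * CF * J ), with J = jamp[ncolor].
  void colorSumToy( std::vector<double>& allMEs,         // [nevt]
                    const std::vector<cxtype>& allJamps, // [nevt*ncolor], icol fastest
                    const std::vector<double>& cfmat,    // [ncolor*ncolor], symmetric
                    int ncolor )
  {
    const int nevt = (int)allMEs.size();
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      const cxtype* jamp = &allJamps[ievt * ncolor];
      double me = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        cxtype ztemp = 0; // one row of T = CF * J; a GEMM computes all rows and all events at once
        for( int j = 0; j < ncolor; j++ ) ztemp += cfmat[i * ncolor + j] * jamp[j];
        me += std::real( std::conj( jamp[i] ) * ztemp );
      }
      allMEs[ievt] += me;
    }
  }

Because CF is the same for every event, the inner product over j for all i and all events is a single ncolor-by-ncolor times ncolor-by-nevt matrix product, which is exactly the shape a cuBLAS/hipBLAS GEMM accelerates.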
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
index 7898714201..bd50ab1357 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
@@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED)
 c
 c     Limit S_hat
 c
-      if (dsqrt_shat.ne.0d0)then
-         if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then
-            passcuts=.false.
-            return
-         endif
-      endif
+      if(nincoming.eq.2) then
+         if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then
+            xvar = sumdot(p(0,1),p(0,2),1d0)
+            if (xvar .lt. dsqrt_shat**2)then
+               passcuts=.false.
+               return
+            else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then
+               passcuts = .false.
+               return
+            endif
+         endif
+      endif
 C     $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight
       if(debug) write (*,*) '============================='
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h
new file mode 100644
index 0000000000..96a34fb1bf
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h
@@ -0,0 +1,103 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin.
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+  //-------------
+  // GPU only
+  //-------------
+
+  //using namespace mg5amcGpu;
+  using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current event (CUDA)
+  unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+  // Wavefunctions
+  // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+  // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+  // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+  // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+  // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+  // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+  // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+  const int nevt = gridDim.x * blockDim.x;
+  fptype* w_fp[nwf];
+  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+  // Couplings
+  constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+  const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
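The genps.f hunk below clamps smax and TAUMAX so that phase space is only generated inside the new upper sqrt(s-hat) limit, while the cuts.f hunk earlier rejects any event that still escapes the window. A sketch of the combined acceptance logic, assuming the convention visible in these hunks that dsqrt_shatmax = -1 means "no upper limit" (passShatWindow and its arguments are illustrative names, not the Fortran variables):

  // Accept an event with invariant mass squared shat if it lies inside
  // the [sqrtShatMin^2, sqrtShatMax^2] window; sqrtShatMax = -1 disables
  // the upper cut, mirroring the dsqrt_shatmax convention above.
  bool passShatWindow( double shat, double sqrtShatMin, double sqrtShatMax )
  {
    if( shat < sqrtShatMin * sqrtShatMin ) return false;                       // below the lower cut
    if( sqrtShatMax != -1. && shat > sqrtShatMax * sqrtShatMax ) return false; // above the upper cut
    return true;
  }

Restricting the sampling range in genps.f and re-checking in cuts.f keeps the two consistent while avoiding wasted phase-space points.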
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not. warned0) then
          print*,'Input jacobian 0 in genps'
@@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
       DOUBLE PRECISION ETA,ETAMIN,ETAMAX
       logical warned
       data warned/.false./
-
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 C------------
 C  BEGIN CODE
 C------------
@@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
 C     IF THERE IS NO S CHANNEL POLE USE BELOW:
       TAUMIN = 0d0 !SMIN/S !keep scale fix
-      TAUMAX = 1D0
+      if (dsqrt_shatmax.ne.-1d0)then
+         TAUMAX=dsqrt_shatmax**2/S
+      else
+         TAUMAX = 1D0
+      endif
       TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN
       SJACOBI= sjacobi*(TAUMAX-TAUMIN)
@@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config)
          if(sde_strat.eq.2)then
            t = dot(ptemp(0,-i), ptemp(0,-i))
            Mass = prmass(-i, config)
-           get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2
+           get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2
          endif
c          write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut
          t = t/stot
@@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config)
            t = dot(ptemp(0,-i), ptemp(0,-i))
            Mass = prmass(-i, config)
            Width = prwidth(-i, config)
-           tmp = (t-Mass)*(t+Mass)
+           tmp = (t-Mass**2)
            tmp2 = Mass*Width
-           get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2
+           get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2)
          endif
c          write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut
        endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
index 9e5f8d44dd..5360566ef4 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
@@ -231,6 +231,7 @@ subroutine set_peaks
       double precision x1,x2,xk(nexternal)
       double precision dr,mtot,etot,xqfact
       double precision spmass
+      double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot
       integer i, iconfig, l1, l2, j, nt, nbw, iproc, k
       integer iden_part(-nexternal+1:nexternal)
@@ -285,8 +286,8 @@ subroutine set_peaks
       integer lbw(0:nexternal)  !Use of B.W.
       common /to_BW/ lbw
-      double precision stot,m1,m2
-      common/to_stot/stot,m1,m2
+      double precision real_stot,m1,m2
+      common/to_stot/real_stot,m1,m2
       include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
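Note on the write_summary hunk here: the patch casts the stored 'bench' entries and per-key values to float before the numeric format string is applied in the ff.write call that follows. A minimal sketch of why the casts are needed; the run name and values below are hypothetical:

    # Values read back from the results record may be strings, not floats.
    bench = ["1.23e-2", "4.56e-3"]
    data = [0.]
    formatting = "%-20s %-15.6e %-15.6e %-15.6e"
    # "%e" applied to a str raises "TypeError: must be real number, not str",
    # hence the explicit float() casts added in write_summary.
    print(formatting % tuple(["run_01"] + [float(x) for x in bench] + data))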
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': 
plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py +++ 
b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

$proc \n"; - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "<br><br> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<br><br> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<br><br>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) 
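The gridpack refine logic continued below splits each channel into jobs of at most max_request_event unweighted events (set from the new maxevts option when nprocs > 1). A minimal sketch of the ceiling-division used in get_job_for_event below; the max_splitting cap exists in gen_ximprove, but its default value here is hypothetical:

    # One job per block of max_request_event events, capped by max_splitting.
    def n_split(needed_event, max_request_event, max_splitting=400):
        nb_split = max(1, (needed_event - 1) // max_request_event + 1)
        return min(nb_split, max_splitting)

    print(n_split(9000, 2500))  # -> 4 jobs of roughly 2250 events each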
+ if 'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += 
float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = 
crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
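The use_mg5amc_py8_interface flag threaded through do_pythia8 and setup_Pythia8RunAndCard selects between the old MG5aMC_PY8_interface executable and Pythia8's standalone main164 driver; with main164, settings are renamed via the interface_to_164 map added to banner.py earlier in this patch. A minimal sketch of that renaming step; translate_for_main164 is a hypothetical helper, not part of the patch:

    # Subset of the interface_to_164 map from banner.py.
    INTERFACE_TO_164 = {
        'HEPMCoutput:file': 'HepMC:output',
        'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    }

    def translate_for_main164(param, value):
        target = INTERFACE_TO_164.get(param, param)
        lines = []
        if target == 'HepMC:output':
            # main164 needs Main:HepMC switched on before the output file is set.
            lines.append('Main:HepMC = on')
        lines.append('%s = %s' % (target, value))
        return lines

    print(translate_for_main164('HEPMCoutput:file', 'events.hepmc.gz'))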
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.')
+                    return self.do_pythia8(line + ' --old_interface')

         self.results.add_detail('run_mode', 'madevent')

@@ -4583,14 +4606,19 @@ def do_pythia8(self, line):
             run_type = 'CKKW'

         # Edit the card and run environment according to the run specification
-        HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type)
+        HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface)
+
+        if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)):
+            PY8_Card['Main:numberOfEvents'] = self.run_card['nevents']
+
         # Now write the card.
         pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name, '%s_pythia8.cmd' % tag)
         cmd_card = StringIO.StringIO()
         PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'),
-                       direct_pythia_input=True)
+                       direct_pythia_input=True,
+                       use_mg5amc_py8_interface=use_mg5amc_py8_interface)

         # Now setup the preamble to make sure that everything will use the locally
         # installed tools (if present) even if the user did not add it to its

@@ -4632,7 +4660,7 @@ def do_pythia8(self, line):
                 " command '/usr/bin/env %s' exists and returns a valid path."%shell)
         exe_cmd = "#!%s\n%s"%(shell_exe,' '.join(
-            [preamble+pythia_main,
+            [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c',
             os.path.basename(pythia_cmd_card)]))
         wrapper.write(exe_cmd)

@@ -4699,6 +4727,7 @@ def do_pythia8(self, line):
             n_cores = max(min(min_n_core,n_cores),1)
         if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1):
+            # No need for parallelization anymore
             self.cluster = None
             logger.info('Follow Pythia8 shower by running the '+

@@ -4744,20 +4773,22 @@ def do_pythia8(self, line):
             ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz')
             ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'),
                 pjoin(self.me_dir,'Cards','pythia8_card_default.dat'),
-                direct_pythia_input=True)
+                direct_pythia_input=True,
+                use_mg5amc_py8_interface=use_mg5amc_py8_interface)
             # Write the wrapper
             wrapper_path = pjoin(parallelization_dir,'run_PY8.sh')
             wrapper = open(wrapper_path,'w')
             if self.options['cluster_temp_path'] is None:
                 exe_cmd = \
-"""#!%s
-./%s PY8Card.dat >& PY8_log.txt
-"""
+"""#!%%s
+./%%s %s PY8Card.dat >& PY8_log.txt
+""" % ('' if use_mg5amc_py8_interface else '-c')
+
             else:
                 exe_cmd = \
-"""#!%s
+"""#!%%s
 ln -s ./events_$1.lhe.gz ./events.lhe.gz
-./%s PY8Card_$1.dat >& PY8_log.txt
+./%%s %s PY8Card_$1.dat >& PY8_log.txt
 mkdir split_$1

 if [ -f ./events.hepmc ]; then
@@ -4776,7 +4807,7 @@ def do_pythia8(self, line):
     mv ./PY8_log.txt ./split_$1/
 fi
 tar -czf split_$1.tar.gz split_$1
-"""
+""" % ('' if use_mg5amc_py8_interface else '-c')
             exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main))
             wrapper.write(exe_cmd)
             wrapper.close()

@@ -4812,19 +4843,27 @@ def do_pythia8(self, line):
                     pjoin(parallelization_dir,split_files[-1]))

             logger.info('Submitting Pythia8 jobs...')
+
             for i, split_file in enumerate(split_files):
                 # We must write a PY8Card tailored for each split so as to correct the normalization
                 # HEPMCoutput:scaling of each weight, since the showered LHE file will no longer contain the
                 # same original number of events
-                split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'))
+                split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user')
+                assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax']
+
+
+                # Make sure to use the number of split_events determined during the splitting.
-                split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i])
+                split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True)
+                assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i]
                 split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']*
-                    (float(partition_for_PY8[i])))
+                    (float(partition_for_PY8[i])), force=True)
                 # Add_missing set to False so as to be sure not to add any additional parameter w.r.t
                 # the ones in the original PY8 param_card copied.
                 split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i),
-                    pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False)
+                    pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False,
+                    direct_pythia_input=True,
+                    use_mg5amc_py8_interface=use_mg5amc_py8_interface)
                 in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)),
                             pjoin(parallelization_dir,'PY8Card_%d.dat'%i),
                             pjoin(parallelization_dir,split_file)]

@@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done):
             # works both for fixed number of generated events and fixed accepted events
             self.results.add_detail('error_pythia', error_m)

-            if self.run_card['use_syst']:
+            if self.run_card['use_syst'] and use_mg5amc_py8_interface:
                 self.results.add_detail('cross_pythia', -1)
                 self.results.add_detail('error_pythia', 0)

@@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None):
                 mfactors[pjoin(P, "G%s" % tag)] = mfactor
         self.Gdirs = (Gdirs, mfactors)
         return self.get_Gdir(Pdir, symfact=symfact)
+
+    ############################################################################
+    def remove_empty_events(self, Gdir):
+        """return Gdir stripped of the directories providing empty events.lhe files."""
+
+        reasons = collections.defaultdict(list)
+        Gdirs = Gdir[:]
+        for G in Gdirs[:]:
+            try:
+                size = os.path.getsize(pjoin(G, 'events.lhe'))
+            except Exception as error:
+                size = 0
+            if size <10:
+                Gdirs.remove(G)
+                try:
+                    log = misc.BackRead(pjoin(G, 'log.txt'))
+                except Exception as error:
+                    log = misc.BackRead(pjoin(G, 'run1_app.log'))
+                found = -1
+                for line in log:
+                    if 'Deleting file events.lhe' in line:
+                        found = 0
+                    elif "Impossible BW configuration" in line:
+                        reasons['bwconfig'].append(G)
+                        break
+                    elif found < -150:
+                        reasons['not found'].append(G)
+                        Gdirs.append(G)
+                        break
+                    elif found < 0:
+                        found -= 1
+                    elif 'Loosen cuts or increase max_events' in line:
+                        reasons['cuts'].append(G)
+                        break
+                    elif 'all returned zero' in line:
+                        reasons['zero'].append(G)
+                        break
+                    elif found > 5:
+                        reasons['unknown'].append(G)
+                        break
+                    else:
+                        found += 1
+
+        if len(reasons):
+            logger.debug('Reasons for empty events.lhe:')
+            if len(reasons['unknown']):
+                logger.debug(' - unknown: %s' % len(reasons['unknown']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]]))
+            if len(reasons['not found']):
+                logger.debug(' - not found in log: %s' % len(reasons['not found']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]]))
+            if len(reasons['zero']):
+                logger.debug(' - zero amplitudes: %s' % len(reasons['zero']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]]))
+            if len(reasons['bwconfig']):
+                critical_bwconfig = set()
+                for G in reasons['bwconfig']:
+                    base = G.rsplit('.',1)[0]
+                    if any(G2.startswith(base) for G2 in Gdirs):
+                        continue
+                    else:
+                        critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:]))
+                for G in critical_bwconfig:
+                    logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found).' % G)
+
+                logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig']))
+                logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig))
+
+            if len(reasons['cuts']):
+                critical_nb_cuts = collections.defaultdict(int)
+                for G in reasons['cuts']:
+                    if '.' in os.path.basename(G):
+                        base = G.rsplit('.',1)[0]
+                        if any(G2.startswith(base) for G2 in Gdirs):
+                            continue
+                        else:
+                            critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1
+                    else:
+                        critical_nb_cuts[''] += 1
+                        logger.warning('Gdirectory %s has no events.lhe file (no points passed cuts).' % G)
+                for G, nb in critical_nb_cuts.items():
+                    if not G:
+                        continue
+                    else:
+                        logger.warning('%s channel %s.XXX has no events.lhe file (no points passed cuts); no %s with events detected' % (nb, G, G))
+                logger.debug(' - no points passed cuts: %s' % len(reasons['cuts']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]]))
+                logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts[''])
+                logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0))
+                #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts))
+
+
+        return Gdirs
+
+    ############################################################################

     def set_run_name(self, name, tag=None, level='parton', reload_card=False,
                      allow_new_tag=True):

@@ -6749,7 +6883,7 @@ def get_subP_ids(path):
 class GridPackCmd(MadEventCmd):
     """The command for the gridpack --These are not supposed to be used interactively--"""

-    def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin):
+    def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin):
         """Initialize the command and directly run"""

         # Initialize properly
@@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s
         self.random = seed
         self.random_orig = self.random
         self.granularity = gran
+        self.nprocs = nprocs
+        self.maxevts = maxevts
         self.options['automatic_html_opening'] = False

         #write the grid_card.dat on disk
@@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed):
             #misc.call([pjoin(self.me_dir,'bin','refine4grid'),
             #    str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed],
             #    cwd=self.me_dir)
-            self.refine4grid(nb_event)
+            self.gridpack_cross = self.refine4grid(nb_event)

             # 3) Combine the events/pythia/...
             self.exec_cmd('combine_events')
@@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event):

         precision = nb_event

+        across = self.make_make_all_html_results(get_attr='axsec')
+
         self.opts = dict([(key,value[1]) for (key,value) in \
                           self._survey_options.items()])

@@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event):
         self.update_status('Refine results to %s' % precision, level=None)
         logger.info("Using random number seed offset = %s" % self.random)

-        refine_opt = {'err_goal': nb_event, 'split_channels': False,
-                      'ngran':self.granularity, 'readonly': self.readonly}
+        refine_opt = {'err_goal': nb_event, 'split_channels': True,
+                      'ngran':self.granularity, 'readonly': self.readonly,
+                      'nprocs': self.nprocs, 'maxevts': self.maxevts}
         x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt)
         x_improve.launch() # create the ajob for the refinement and run those!
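# ---------------------------------------------------------------------------
# [Aside, not part of the patch] The remove_empty_events helper added above
# decides, for each G directory whose events.lhe is missing or nearly empty,
# why no events were produced: it scans the channel log backwards for known
# sentinel messages. A minimal self-contained sketch of that classification
# idea (simplified: a plain reversed list stands in for misc.BackRead, and the
# directory name is made up):
# ---------------------------------------------------------------------------
import collections

SENTINELS = [('Impossible BW configuration', 'bwconfig'),
             ('Loosen cuts or increase max_events', 'cuts'),
             ('all returned zero', 'zero')]

def classify_empty_gdir(log_lines):
    """Return a reason tag for an empty events.lhe file."""
    for line in reversed(log_lines):  # like misc.BackRead: read the tail first
        for needle, tag in SENTINELS:
            if needle in line:
                return tag
    return 'not found'

reasons = collections.defaultdict(list)
reasons[classify_empty_gdir(['INFO: done', 'Loosen cuts or increase max_events'])].append('P1_gg_ttx/G1')
assert dict(reasons) == {'cuts': ['P1_gg_ttx/G1']}
# ---------------------------------------------------------------------------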
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..d523fcab47 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -988,7 +988,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1011,7 +1011,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -1024,7 +1024,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1049,7 +1049,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1062,7 +1062,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1074,7 +1074,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1087,7 +1087,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1138,7 +1138,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1151,7 +1151,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1169,7 +1169,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1184,7 +1184,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1199,7 +1199,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -1214,7 +1214,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1229,7 +1229,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -1244,7 +1244,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..eb2e5744ce 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..3f22a38896 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -657,7 +657,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
  __device__ inline void
  G2COUP( const fptype gs[],
          fptype couplings[],
@@ -667,12 +667,12 @@ namespace mg5amcCpu
    using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings;
    const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
    DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_7s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 );
-    fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
-    fptype* GC_8s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 );
-    cxtype_sv_ref GC_7s_sv = C_ACCESS::kernelAccess( GC_7s );
-    cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
-    cxtype_sv_ref GC_8s_sv = C_ACCESS::kernelAccess( GC_8s );
+    fptype* GC_7s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 );
+    fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+    fptype* GC_8s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 );
+    cxtype_sv_ref GC_7s_sv = CD_ACCESS::kernelAccess( GC_7s );
+    cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+    cxtype_sv_ref GC_8s_sv = CD_ACCESS::kernelAccess( GC_8s );
    GC_7s_sv = couplings_sv.GC_7;
    GC_6s_sv = couplings_sv.GC_6;
    GC_8s_sv = couplings_sv.GC_8;
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose whether cuBLAS and hipBLAS are supported
+// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 5444229389..ad8d58b375 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12831377983093262  +DEBUG: model prefixing takes 0.1275167465209961  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,33 +88,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.671 s +1 processes with 72 diagrams generated in 3.713 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.305 s VVV5 VVV5 FFV1 @@ -124,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.073s -user 0m4.975s -sys 0m0.073s +real 0m5.095s +user 0m4.971s +sys 0m0.077s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
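As an illustration of how the new stream and BLAS plumbing above composes, here is a minimal usage sketch. It is not part of the patch: the function name and arguments are illustrative, and it assumes a CUDA build with BLAS enabled (MGONGPU_HAS_NO_BLAS undefined) and fptype2=double, so that gpuBlasTgemv dispatches to cublasDgemv.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
void sketchBlasOnStream( int n, const double* devA, const double* devX, double* devY )
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // expands to checkGpu( cudaStreamCreate( &stream ) )
  gpuBlasHandle_t handle; // cublasHandle_t in CUDA BLAS builds
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // handle operations now enqueue on this stream
  const double alpha = 1., beta = 0.;
  // y = A * x via the precision-dispatched gemv (gpuBlasTgemv -> gpuBlasDgemv -> cublasDgemv here)
  checkGpuBlas( gpuBlasTgemv( handle, GPUBLAS_OP_N, n, n, &alpha, devA, n, devX, 1, &beta, devY, 1 ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream ); // expands to checkGpu( cudaStreamDestroy( stream ) )
}

This is exactly the pattern MatrixElementKernelDevice follows below, with one stream and one handle per good helicity.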
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
 MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
 }
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 #include "mgOnGpuConfig.h"
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 #include
+#include <memory>
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
 // Does this host system support the SIMD used in the matrix element calculation?
 // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 private:
@@ -191,12 +194,24 @@ namespace mg5amcCpu
 // The buffer for the event-by-event couplings that depends on alphas QCD
 DeviceBufferCouplings m_couplings;
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
 PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
 // The number of blocks in the GPU grid
 size_t m_gpublocks;
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
 //----------------------------------------------------------------------------
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
  };
 //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
index a4f3a481bb..84c20a1f30 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
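For reference, the trivial access pattern that survives the cleanup above amounts to reinterpreting the amplitude buffer in place. A minimal sketch (not part of the patch; it assumes a scalar build where cxtype_sv is a plain complex type rather than a SIMD vector):

fptype amp[2] = { 1., 0. }; // one amplitude as interleaved [re, im]
cxtype_sv* camp = KernelAccessAmplitudes<false>::kernelAccess( amp ); // same memory viewed as one complex value, no per-event indexing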
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
    {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
    {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
    }
  };
+#endif
 //----------------------------------------------------------------------------
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
  {
  public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
    static __host__ __device__ inline const cxtype_sv*
    kernelAccessConst( const fptype* buffer )
    {
      return reinterpret_cast<const cxtype_sv*>( buffer );
    }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
  };
 //----------------------------------------------------------------------------
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
index 2f711d8cc1..7f3a4e3dca 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
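The device-side wavefunction accessors introduced above boil down to a per-thread offset into an AOS layout wf[ievt][nw6][nx2], while the host side keeps the trivial one-event view. A host analogue of the same offset arithmetic, as a sketch (not part of the patch; the function name is illustrative):

inline fptype* eventWfRecord( fptype* buffer, int ievt )
{
  // each event owns nw6 complex components, i.e. nw6 * nx2 fptype values
  return buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2;
}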
#ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
 static constexpr size_t nparf = CPPProcess::nparf;
 static constexpr size_t npar = CPPProcess::npar;
 static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
 }
 //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
 protected:
 BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
 public:
+    virtual ~BufferBase() {}
 T* data() { return m_data; }
 const T* data() const { return m_data; }
 T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
 public:
 HostBuffer( const size_t nevt )
 : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
 virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
 };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer for a given number of events
 template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
 {
 public:
 DeviceBuffer( const size_t nevt )
 : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
 virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
 };
 #endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+  typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
 // A base class encapsulating a memory buffer for momenta random numbers
 typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
 constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
 typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
 typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
 typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
 typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators 
typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
 typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
 constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
 typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
 typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
 typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for color selection
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
 template<class Tdst, class Tsrc>
 void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
index 6a64c39915..42eaa96778 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer 
includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a 
+ + // Evaluate the QCD partial amplitudes (jamps) for this given helicity from the Feynman diagrams + // Also compute the running sums over helicities, adding the jamp2, numerator and denominator contributions for this helicity + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or two, in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,1281 +279,277 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif
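
The reinterpret_cast above relies only on the guarantee that an array of complex SIMD values is laid out as a contiguous block of floating-point values. A minimal standalone sketch of that layout assumption (with double standing in for fptype, a plain real/imaginary pair standing in for cxtype_sv in the scalar neppV=1 case, and nwf=18, nw6=6 as in this subprocess):

  struct ComplexPair { double r, i; };   // stand-in for cxtype_sv (scalar case)
  ComplexPair w_example[18][6];          // like w_sv: nwf=18 wavefunctions of nw6=6 components each
  static_assert( sizeof( w_example ) == 18 * 6 * 2 * sizeof( double ), "unexpected layout" );
  double* wfs_example = reinterpret_cast<double*>( w_example ); // flat fptype*-style view, as in the C++ branch above

In the CUDA branch the same flat view instead points at the allWfs global-memory buffer of size nwf*nw6*2*nevt, since with kernel splitting the wavefunctions of all events must persist across per-diagram kernel launches.
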
// === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 72 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( 
w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 72 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 72 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 72 *** - - // Wavefunction(s) for diagram number 4 - FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 72 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 72 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 6 - VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 72 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 72 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 72 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 10 OF 72 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 72 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 72 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 13 OF 72 *** - - // Wavefunction(s) for diagram number 13 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 72 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 72 *** - - // Wavefunction(s) for diagram number 15 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 16 OF 72 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 72 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 18 OF 72 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 19 OF 72 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 20 OF 72 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 72 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 72 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 72 *** - - // Wavefunction(s) for diagram number 23 - FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 72 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 25 OF 72 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 26 OF 72 *** - - // Wavefunction(s) for diagram number 26 - // (none) - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 27 OF 72 *** - - // Wavefunction(s) for diagram number 27 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 28 OF 72 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 29 OF 72 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 30 OF 72 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 72 *** - - // Wavefunction(s) for diagram number 31 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 32 OF 72 *** - - // Wavefunction(s) for diagram number 32 - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 72 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 34 OF 72 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 72 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 36 OF 72 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 37 OF 72 *** - - // Wavefunction(s) for diagram number 37 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 38 OF 72 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 72 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 40 OF 72 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 72 *** - - // Wavefunction(s) for diagram number 41 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 42 OF 72 *** - - // Wavefunction(s) for diagram number 42 - // (none) - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 43 OF 72 *** - - // Wavefunction(s) for diagram number 43 - FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 44 OF 72 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 45 OF 72 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 46 OF 72 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 47 OF 72 *** - - // Wavefunction(s) for diagram number 47 - FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 48 OF 72 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 49 OF 72 *** - - // Wavefunction(s) for diagram number 49 - // (none) - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 50 OF 72 *** - - // Wavefunction(s) for diagram number 50 - // (none) - - // Amplitude(s) for diagram number 50 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 51 OF 72 *** - - // Wavefunction(s) for diagram number 51 - FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 52 OF 72 *** - - // Wavefunction(s) for diagram number 52 - VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 72 *** - - // Wavefunction(s) for diagram number 53 - // (none) - - // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 54 OF 72 *** - - // Wavefunction(s) for diagram number 54 - VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 55 OF 72 *** - - // Wavefunction(s) for diagram number 55 - FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 56 OF 72 *** - - // Wavefunction(s) for diagram number 56 - VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 57 OF 72 *** - - // Wavefunction(s) for diagram number 57 - // (none) - - // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 58 OF 72 *** - - // Wavefunction(s) for diagram number 58 - // (none) - - // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 59 OF 72 *** - - // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 60 OF 72 *** - - // Wavefunction(s) for diagram number 60 - VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 60 - FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 61 OF 72 *** - - // Wavefunction(s) for diagram number 61 - // (none) - - // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 62 OF 72 *** - - // Wavefunction(s) for diagram number 62 - // (none) - - // Amplitude(s) for diagram number 62 - FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 63 OF 72 *** - - // Wavefunction(s) for diagram number 63 - FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 64 OF 72 *** - - // Wavefunction(s) for diagram number 64 - // (none) - - // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 65 OF 72 *** - - // Wavefunction(s) for diagram number 65 - // (none) - - // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 66 OF 72 *** - - // Wavefunction(s) for diagram number 66 - // (none) - - // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 67 OF 72 *** - - // Wavefunction(s) for diagram number 67 - // (none) - - // Amplitude(s) for diagram number 67 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 68 OF 72 *** - - // Wavefunction(s) for diagram number 68 - // (none) - - // Amplitude(s) for diagram number 68 - VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 69 OF 72 *** - - // Wavefunction(s) for diagram number 69 - // (none) - - // Amplitude(s) for diagram number 69 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 70 OF 72 *** - - // Wavefunction(s) for diagram number 70 - // (none) - - // Amplitude(s) for diagram number 70 - VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 71 OF 72 *** - - // Wavefunction(s) for diagram number 71 - // (none) - - // Amplitude(s) for diagram number 71 - VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 72 OF 72 *** - - // Wavefunction(s) for diagram number 72 - // (none) - - // Amplitude(s) for diagram number 72 - VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. 
* amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) - - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel for real symmetric M) - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
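
The identity invoked in these removed comments, and the upper-triangle fold that exploits it, can be checked in isolation. Below is a minimal standalone sketch (a toy ncolor=2 case with illustrative values; plain doubles stand in for the SIMD fptype2_sv types, and the fold is valid here because all color denominators are equal, as in the denom[12] array of threes above):

  inline bool colorSumFoldAgrees()
  {
    constexpr int nc = 2;
    const double cfx[nc][nc] = { { 48, 16 }, { 16, 48 } };    // real symmetric, like cf
    const double den[nc] = { 3, 3 };                          // equal denominators, like denom
    const double A[nc] = { 0.3, -1.2 }, B[nc] = { 0.7, 0.4 }; // Re and Im parts of jamp
    double me1 = 0; // full double sum over the color matrix (as in the CUDA branch below)
    for( int i = 0; i < nc; i++ )
    {
      double ztR = 0, ztI = 0;
      for( int j = 0; j < nc; j++ ) { ztR += cfx[i][j] * A[j]; ztI += cfx[i][j] * B[j]; }
      me1 += ( ztR * A[i] + ztI * B[i] ) / den[i];
    }
    double me2 = 0; // upper-triangle fold with precomputed 2*cf/denom (as in the C++ branch below)
    for( int i = 0; i < nc; i++ )
    {
      double ztR = cfx[i][i] / den[i] * A[i], ztI = cfx[i][i] / den[i] * B[i];
      for( int j = i + 1; j < nc; j++ ) { ztR += 2 * cfx[i][j] / den[i] * A[j]; ztI += 2 * cfx[i][j] / den[i] * B[j]; }
      me2 += A[i] * ztR + B[i] * ztI;
    }
    return ( me1 - me2 ) * ( me1 - me2 ) < 1e-24; // identical up to floating-point rounding
  }
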
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
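
In both branches the jamps pointer is then indexed with the structure-of-arrays convention of the DeviceAccessJamp2 helper defined earlier, where all events of a given color index are contiguous. A minimal sketch of that indexing (a hypothetical standalone helper, with double standing in for fptype):

  // Component icol of event ievt lives at [icol * nevt + ievt]: consecutive GPU threads
  // (consecutive ievt, same icol) therefore read and write consecutive, coalesced addresses.
  inline double& jampAt( double* buffer, const int icol, const int ievt, const int nevt )
  {
    return buffer[icol * nevt + ievt];
  }
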
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIds for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelId for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 72 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1587,7 +636,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1622,6 +675,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1664,6 +721,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1766,26 +827,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1793,25 +854,40 @@ namespace 
mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* 
ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1956,13 +1221,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; 
// non-trivial access: buffer includes all events @@ -1974,17 +1233,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -2010,93 +1272,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], 
gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2138,7 +1370,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2161,7 +1393,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2170,25 +1402,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -2198,8 +1436,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2215,11 +1455,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2321,14 +1562,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index d207c3303f..b147b40b3b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 76; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
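For orientation on the new color_sum.cc that follows: per event and per helicity it ADDS to the running |M|^2 the quadratic form jamp^dagger * (colorMatrix/colorDenom) * jamp over the ncolor=12 leading-color amplitudes; since the color matrix is real, only the two real quadratic forms in Re(jamp) and Im(jamp) survive (see #475). A minimal standalone sketch of this reference computation, illustrative only and not part of the generated file (the helper name colorSumNaive and the std::complex signature are ours):

#include <complex>
// Reference color sum for one event and one helicity (hypothetical helper, not plugin code):
//   me2 += sum_i sum_j conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j]
// which, for a real color matrix, reduces to two real quadratic forms (see #475).
template<int NCOLOR>
double colorSumNaive( const std::complex<double> ( &jamp )[NCOLOR],
                      const double ( &colorMatrix )[NCOLOR][NCOLOR],
                      const double ( &colorDenom )[NCOLOR] )
{
  double me2 = 0;
  for( int icol = 0; icol < NCOLOR; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of the color matrix times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < NCOLOR; jcol++ )
    {
      ztempR += colorMatrix[icol][jcol] * jamp[jcol].real();
      ztempI += colorMatrix[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / colorDenom[icol];
  }
  return me2; // the caller ADDS this to the running sum of |M|^2 over helicities
}

The generated color_sum_cpu below implements the same formula, but folds the per-row 1/colorDenom and the factor 2 for the off-diagonal terms of the symmetric matrix into a constexpr triangular matrix, and vectorizes over SIMD event pages.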
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel because M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = 
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
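[Editor's note] Stripped of the access-class machinery, color_sum_kernel is one thread per event evaluating the full (non-triangular) quadratic form against the device-resident normalized color matrix. A schematic restatement with a hypothetical flat layout (jampsR/jampsI of shape [ncolor][nevt], events contiguous per color; all names here are illustrative only, this compiles standalone with nvcc):

  __global__ void quadFormKernel( float* mes, const float* jampsR, const float* jampsI,
                                  const float* normColMat, int ncol, int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
    if( ievt >= nevt ) return;
    float delta = 0;
    for( int i = 0; i < ncol; i++ )
    {
      float ztR = 0, ztI = 0;
      for( int k = 0; k < ncol; k++ )
      {
        ztR += normColMat[i * ncol + k] * jampsR[k * nevt + ievt]; // "new1"-like striding
        ztI += normColMat[i * ncol + k] * jampsI[k * nevt + ievt];
      }
      delta += ztR * jampsR[i * nevt + ievt] + ztI * jampsI[i * nevt + ievt];
    }
    mes[ievt] += delta; // add this helicity's |M|^2 to the running sum
  }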
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied!
+    // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
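[Editor's note] Both conversion kernels, and the BLAS calls below, agree on the "new1" striding. The index arithmetic is easy to get wrong, so a small standalone check (plain C++, hypothetical sizes) may help: element (ix2, icol, ievt) lives at ix2*ncolor*nevt + icol*nevt + ievt, i.e. events are contiguous within a (plane, color) column.

  #include <cassert>

  int main()
  {
    const int ncolor = 12, nevt = 16;
    auto idx = [=]( int ix2, int icol, int ievt )
    { return ix2 * ncolor * nevt + icol * nevt + ievt; };
    // Consecutive events of the same (ix2, icol) are adjacent: coalesced loads on the GPU
    assert( idx( 1, 3, 5 ) + 1 == idx( 1, 3, 6 ) );
    // Stride between colors is nevt; stride between the real and imaginary planes is ncolor*nevt
    assert( idx( 0, 4, 0 ) - idx( 0, 3, 0 ) == nevt );
    assert( idx( 1, 0, 0 ) - idx( 0, 0, 0 ) == ncolor * nevt );
    return 0;
  }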
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) ); // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) ); // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
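[Editor's note] The two-step BLAS scheme in color_sum_blas replaces the per-event loop with one gemm per real/imag plane (Step 1) plus a batch of 1x1 gemms, i.e. per-event dot products accumulated into the MEs (Step 2). A dependency-free sketch of the same algebra (plain C++, toy sizes, one plane only; the real code additionally handles the transposed "new1" layout and mixed precision):

  #include <cassert>
  #include <cmath>
  #include <vector>

  int main()
  {
    const int ncol = 3, nevt = 4;
    const double C[3][3] = { { 2, 1, 0 }, { 1, 2, 1 }, { 0, 1, 2 } }; // toy normalized color matrix
    std::vector<double> J( ncol * nevt ), Z( ncol * nevt, 0. ), ME( nevt, 0. ), ref( nevt, 0. );
    for( int i = 0; i < ncol * nevt; i++ ) J[i] = 0.1 * i - 0.5; // arbitrary "jamp" values
    // Step 1: Z(i,e) = sum_k C(i,k) * J(k,e), one matrix-matrix product
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        for( int k = 0; k < ncol; k++ )
          Z[i * nevt + e] += C[i][k] * J[k * nevt + e];
    // Step 2: ME(e) += sum_i J(i,e) * Z(i,e), one 1x1 "gemm" (dot product) per event
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        ME[e] += J[i * nevt + e] * Z[i * nevt + e];
    // Reference: the direct quadratic form per event
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        for( int k = 0; k < ncol; k++ )
          ref[e] += J[i * nevt + e] * C[i][k] * J[k * nevt + e];
    for( int e = 0; e < nevt; e++ ) assert( std::fabs( ME[e] - ref[e] ) < 1e-12 );
    return 0;
  }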
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,    // input: cuda gpublocks
+                 const int gputhreads )  // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h
new file mode 100644
index 0000000000..9b06366348
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h
@@ -0,0 +1,2185 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
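[Editor's note] Every diagramN kernel in the generated file below follows the same shape: compute or reuse wavefunctions, evaluate one amplitude via a HELAS-style call, then scatter that amplitude into the color-flow array with fixed rational coefficients. Schematically (a hypothetical two-color toy using this codebase's cxtype, not one of the 72 diagrams below):

  __device__ void accumulateToyDiagram( cxtype* jamp, const cxtype amp )
  {
    // +i/2 of the amplitude flows into color flow 0, -i/6 into color flow 1 (toy coefficients)
    jamp[0] += 1. / 2. * cxtype( 0, 1 ) * amp;
    jamp[1] -= 1. / 6. * cxtype( 0, 1 ) * amp;
  }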
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 72 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); + VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 72 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 72 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 72 *** + // Wavefunction(s) for diagram number 4 + FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 72 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 72 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 6 + VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 72 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 72 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 72 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 72 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 72 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 72 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 72 *** + // Wavefunction(s) for diagram number 13 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 72 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 72 *** + // Wavefunction(s) for diagram number 15 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 72 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 72 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 72 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 72 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 72 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 72 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 72 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 72 *** + // Wavefunction(s) for diagram number 23 + FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 72 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 72 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 72 *** + // Wavefunction(s) for diagram number 26 + // (none) + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 72 *** + // Wavefunction(s) for diagram number 27 + FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 72 *** + // Wavefunction(s) for diagram number 28 + // (none) + // Amplitude(s) for diagram number 28 + VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 72 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 72 *** + // Wavefunction(s) for diagram number 30 + // (none) + // Amplitude(s) for diagram number 30 + VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 72 *** + // Wavefunction(s) for diagram number 31 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 72 *** + // Wavefunction(s) for diagram number 32 + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 72 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 72 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 72 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 72 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 37 OF 72 *** + // Wavefunction(s) for diagram number 37 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 37 + FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 38 OF 72 *** + // Wavefunction(s) for diagram number 38 + // (none) + // Amplitude(s) for diagram number 38 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
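
The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects two coupling layouts: the GPU build passes one flat buffer of dependent couplings for all events, while the C++ build passes an array of pointers, one per (dependent or independent) coupling, already positioned on the current event page, which is why the kernels can index COUPs[0], COUPs[1], COUPs[2] directly. The sketch below illustrates the difference under an assumed event-major packing of (real, imaginary) pairs; the plugin hides the real layout behind its memory-access classes, so the index arithmetic and the names coupGpu/coupCpp here are illustrative only.

  // Assumption-based sketch of the two coupling layouts implied by the signatures.
  #include <cstddef>
  using fptype = double;
  // GPU build: couplings[nevt*ndcoup*2], dependent couplings only; assumed
  // event-major packing with (real, imaginary) stored contiguously per coupling.
  inline const fptype* coupGpu( const fptype* couplings, std::size_t ndcoup, std::size_t ievt, std::size_t icoup )
  {
    return &couplings[( ievt * ndcoup + icoup ) * 2]; // [0] = real, [1] = imag
  }
  // C++ build: COUPs[nxcoup] holds one pointer per coupling, each already
  // positioned on the current event page, so no event index is needed here.
  inline const fptype* coupCpp( const fptype** COUPs, std::size_t icoup )
  {
    return COUPs[icoup];
  }
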
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 39 OF 72 *** + // Wavefunction(s) for diagram number 39 + // (none) + // Amplitude(s) for diagram number 39 + FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 40 OF 72 *** + // Wavefunction(s) for diagram number 40 + // (none) + // Amplitude(s) for diagram number 40 + VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 41 OF 72 *** + // Wavefunction(s) for diagram number 41 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 41 + FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 42 OF 72 *** + // Wavefunction(s) for diagram number 42 + // (none) + // Amplitude(s) for diagram number 42 + FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 43 OF 72 *** + // Wavefunction(s) for diagram number 43 + FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 43 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 44 OF 72 *** + // Wavefunction(s) for diagram number 44 + // (none) + // Amplitude(s) for diagram number 44 + FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 45 OF 72 *** + // Wavefunction(s) for diagram number 45 + // (none) + // Amplitude(s) for diagram number 45 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 46 OF 72 *** + // Wavefunction(s) for diagram number 46 + // (none) + // Amplitude(s) for diagram number 46 + FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 47 OF 72 *** + // Wavefunction(s) for diagram number 47 + FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 47 + FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 48 OF 72 *** + // Wavefunction(s) for diagram number 48 + // (none) + // Amplitude(s) for diagram number 48 + VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 72 *** + // Wavefunction(s) for diagram number 49 + // (none) + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 50 OF 72 *** + // Wavefunction(s) for diagram number 50 + // (none) + // Amplitude(s) for diagram number 50 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
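
Each kernel ends by folding its amplitude into a few color-ordered sums ("jamps") with small rational, and sometimes imaginary, color coefficients; the jamp indices used in this section run from 0 to 11, i.e. ncolor = 12 for this process. Below is a single-event toy version of the pattern, with a plain array standing in for the J_ACCESS::kernelAccessIcol accessor (jampIcol and accumulateDiagram49 are hypothetical names, not plugin code).

  // Single-event toy model of the jamp accumulation pattern (illustrative only).
  #include <complex>
  using cxtype = std::complex<double>;
  constexpr int ncolor = 12; // jamp indices 0..11 appear in the kernels above
  // Stand-in for J_ACCESS::kernelAccessIcol( jamps, icol ) for one event.
  inline cxtype& jampIcol( cxtype* jamps, int icol ) { return jamps[icol]; }
  // Example: the two updates performed by diagram49 above, for one event;
  // the running jamp sums are later contracted with the color matrix.
  void accumulateDiagram49( cxtype* jamps, const cxtype amp )
  {
    jampIcol( jamps, 9 ) += 1. / 6. * amp;
    jampIcol( jamps, 11 ) -= 1. / 2. * amp;
  }
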
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 51 OF 72 *** + // Wavefunction(s) for diagram number 51 + FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 51 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 52 OF 72 *** + // Wavefunction(s) for diagram number 52 + VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 52 + FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 53 OF 72 *** + // Wavefunction(s) for diagram number 53 + // (none) + // Amplitude(s) for diagram number 53 + FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 72 *** + // Wavefunction(s) for diagram number 54 + VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 72 *** + // Wavefunction(s) for diagram number 55 + FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 72 *** + // Wavefunction(s) for diagram number 56 + VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 56 + FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 72 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 58 OF 72 *** + // Wavefunction(s) for diagram number 58 + // (none) + // Amplitude(s) for diagram number 58 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 72 *** + // Wavefunction(s) for diagram number 59 + FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 59 + FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 72 *** + // Wavefunction(s) for diagram number 60 + VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 60 + FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 61 OF 72 *** + // Wavefunction(s) for diagram number 61 + // (none) + // Amplitude(s) for diagram number 61 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 62 OF 72 *** + // Wavefunction(s) for diagram number 62 + // (none) + // Amplitude(s) for diagram number 62 + FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 72 *** + // Wavefunction(s) for diagram number 63 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 72 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 72 *** + // Wavefunction(s) for diagram number 65 + // (none) + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 72 *** + // Wavefunction(s) for diagram number 66 + // (none) + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 72 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 72 *** + // Wavefunction(s) for diagram number 68 + // (none) + // Amplitude(s) for diagram number 68 + VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 72 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 72 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 72 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 72 *** + // Wavefunction(s) for diagram number 72 + // (none) + // Amplitude(s) for diagram number 72 + VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
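The three jamp stridings discussed in DeviceAccessJamp::kernelAccessIcol above are easiest to compare as plain index arithmetic. The following standalone sketch (illustrative only, with hypothetical helper names, not part of the plugin) reproduces the "old", "new1" and "new2" layouts of a buffer of ncolor*2*nevt fptypes:

    // Index of the real (imag==false) or imaginary (imag==true) part of jamp[icol]
    // for event ievt, in each of the three layouts described in color_sum.h.
    #include <cstddef>
    // "old": ncolor blocks of 2*nevt (one Re plane then one Im plane per color), ievt fastest
    inline std::size_t jampIdxOld( std::size_t icol, std::size_t ievt, std::size_t nevt, bool imag )
    {
      return icol * 2 * nevt + ( imag ? nevt : 0 ) + ievt;
    }
    // "new1": one Re plane then one Im plane, each of ncolor*nevt, ievt fastest (cuBLAS-friendly)
    inline std::size_t jampIdxNew1( std::size_t icol, std::size_t ievt, std::size_t nevt, std::size_t ncolor, bool imag )
    {
      return ( imag ? 1 : 0 ) * ncolor * nevt + icol * nevt + ievt;
    }
    // "new2": one Re plane then one Im plane, each of nevt*ncolor, icol fastest
    inline std::size_t jampIdxNew2( std::size_t icol, std::size_t ievt, std::size_t nevt, std::size_t ncolor, bool imag )
    {
      return ( imag ? 1 : 0 ) * nevt * ncolor + ievt * ncolor + icol;
    }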
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
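For reference, the quantity that the color_sum_cpu and color_sum_gpu functions declared above accumulate per event and helicity is the standard MG5aMC color sum, |M|^2 += sum_ij conj(jamp_i) * C_ij * jamp_j, with C the color matrix. The sketch below is a naive scalar version under that assumption (hypothetical colorSumNaive helper, std::complex types); the real code instead uses vectorized cxtype_sv on CPU and CUDA/HIP kernels or cuBLAS/hipBLAS on GPU, with the normalized color matrix prepared via createNormalizedColorMatrix():

    #include <complex>
    #include <cstddef>
    #include <vector>
    // Naive reference: return the color-summed |M|^2 for one event and one helicity.
    double colorSumNaive( const std::vector<std::complex<double>>& jamp,        // [ncolor] dual amplitudes
                          const std::vector<std::vector<double>>& colorMatrix ) // [ncolor][ncolor], real symmetric
    {
      double me2 = 0;
      for( std::size_t icol = 0; icol < jamp.size(); icol++ )
      {
        std::complex<double> ztemp = 0;
        for( std::size_t jcol = 0; jcol < jamp.size(); jcol++ )
          ztemp += colorMatrix[icol][jcol] * jamp[jcol]; // row-times-vector product C*jamp
        me2 += ( ztemp * std::conj( jamp[icol] ) ).real(); // contract with conj(jamp)
      }
      return me2;
    }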
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
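One detail of the diagram_boilerplate.h contract above is worth spelling out: the diagramXXX functions always take channelIds, numerators and denominators, but in builds without MGONGPU_SUPPORTS_MULTICHANNEL the caller must pass nullptr for all three, which the boilerplate asserts. A minimal self-contained mock of that calling convention (hypothetical diagramMock, not plugin code):

    #include <cassert>
    #include <cstdio>
    typedef double fptype;
    // Mock with the same contract as a generated diagramXXX function.
    void diagramMock( fptype* jamps, const unsigned int* channelIds, fptype* numerators, fptype* denominators )
    {
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
      // Sanity checks mirroring diagram_boilerplate.h: unused multichannel pointers must be null
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif
      jamps[0] += 1.; // stand-in for the real jamp updates
    }
    int main()
    {
      fptype jamps[2] = { 0, 0 };
      diagramMock( jamps, nullptr, nullptr, nullptr ); // valid call shape in a non-multichannel build
      printf( "jamps[0]=%f\n", jamps[0] );
      return 0;
    }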
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..d523fcab47 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( 
const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -988,7 +988,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1011,7 +1011,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -1024,7 +1024,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1049,7 +1049,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1062,7 +1062,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1074,7 +1074,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1087,7 +1087,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1138,7 +1138,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1151,7 +1151,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1169,7 +1169,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1184,7 +1184,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1199,7 +1199,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -1214,7 +1214,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1229,7 +1229,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -1244,7 +1244,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..eb2e5744ce 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..3f22a38896 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -657,7 +657,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -667,12 +667,12 @@ namespace mg5amcCpu using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_7s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_8s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 ); - cxtype_sv_ref GC_7s_sv = C_ACCESS::kernelAccess( GC_7s ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_8s_sv = C_ACCESS::kernelAccess( GC_8s ); + fptype* GC_7s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_8s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 ); + cxtype_sv_ref GC_7s_sv = CD_ACCESS::kernelAccess( GC_7s ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_8s_sv = CD_ACCESS::kernelAccess( GC_8s ); GC_7s_sv = couplings_sv.GC_7; GC_6s_sv = couplings_sv.GC_6; GC_8s_sv = couplings_sv.GC_8; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, use BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 1690ef1273..1aa898b488 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.124 s +1 processes with 6 diagrams generated in 0.113 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -576,57 +576,57 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s -Wrote files for 16 helas calls in 0.082 s +Wrote files for 16 helas calls in 0.084 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.186 s +ALOHA: aloha creates 3 routines in 0.177 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha 
creates 6 routines in 0.184 s +ALOHA: aloha creates 6 routines in 0.176 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 215 (offset -12 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. 
+DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.996s -user 0m2.690s -sys 0m0.299s +real 0m2.968s +user 0m2.658s +sys 0m0.309s Code generation completed in 3 seconds ************************************************************ * * @@ -640,7 +640,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -648,9 +648,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -670,7 +670,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -678,9 +678,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index 9025117612..154187e345 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
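The MGONGPUCPP_VERBOSE guards added in Bridge.h above compile the instantiation banners and the abnormal-ME flagging out of default builds. A minimal sketch of the pattern, assuming the macro is supplied on the compiler command line (the flag spelling and function name below are illustrative, not taken from this patch):

#include <iostream>
// Built with e.g. -DMGONGPUCPP_VERBOSE the banner is printed; without it the whole
// body compiles away (illustrative sketch of the guard used in Bridge.h)
inline void bridgeBannerSketch( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#else
  (void)nevt; // avoid an unused-parameter warning in quiet builds
#endif
}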
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //--------------------------------------------------------------------------
#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
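The gpuBlasT* aliases at the end of GpuAbstraction.h above select the S- or D-precision cuBLAS/hipBLAS entry points at compile time via MGONGPU_FPTYPE2_FLOAT, so the color-sum code can be written once for both precisions. A minimal sketch of a GEMV-based color sum, assuming a column-major nevt-by-ncolor buffer of real jamp components and a real color vector (the function and buffer names are illustrative, not from this patch):

#ifndef MGONGPU_HAS_NO_BLAS
#include "GpuAbstraction.h"
#include <cassert>
// me2 += jampRe * colorVec in whichever precision fptype2 maps to (fptype2 from mgOnGpuConfig.h)
inline void colorSumGemvSketch( gpuBlasHandle_t handle, const fptype2* jampRe, const fptype2* colorVec, fptype2* me2, int nevt, int ncolor )
{
  const fptype2 alpha = 1; // scale of the matrix-vector product
  const fptype2 beta = 1;  // accumulate into the existing me2 contents
  gpuBlasStatus_t status = gpuBlasTgemv( handle, GPUBLAS_OP_N, nevt, ncolor, &alpha, jampRe, nevt, colorVec, 1, &beta, me2, 1 );
  assert( status == GPUBLAS_STATUS_SUCCESS );
}
#endif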
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
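The checkGpuBlas wrapper defined in GpuRuntime.h above mirrors the existing checkGpu macro: it prints the failing status with file and line and then asserts. A minimal usage sketch, modelled on the per-helicity handle setup that MatrixElementKernels.cc performs below (the function name is illustrative):

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
#include "GpuRuntime.h"
// Create one cuBLAS/hipBLAS handle and bind it to an existing stream, asserting on failure
inline void createBlasHandleOnStreamSketch( gpuBlasHandle_t& handle, gpuStream_t stream )
{
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) );
}
#endif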
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };

   //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
index 3802fa57c0..26345d4b43 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
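As an aside on the AOSOA bookkeeping removed above: the sketch below is illustrative only (not part of the patch). It mirrors the removed MemoryAccessAmplitudesBase logic in stand-alone C++, with float standing in for fptype and all names local to the sketch, to show how "accessField = decodeRecord( accessRecord )" composes.

    #include <cassert>

    constexpr int neppA = 1; // events per "A-page" (neppA=1 degenerates to an AOS)
    constexpr int nx2 = 2;   // floating point components of a complex amplitude (Re, Im)

    // Step 1: locate the record (A-page slice) that contains event ievt
    inline float* ieventAccessRecord( float* buffer, int ievt )
    {
      const int ipagA = ievt / neppA; // index of the A-page
      const int ieppA = ievt % neppA; // index of the event inside that A-page
      return &buffer[ipagA * nx2 * neppA + 0 * neppA + ieppA]; // AOSOA[ipagA][0][ieppA]
    }

    // Step 2: locate one field (ix2=0 for Re, ix2=1 for Im) inside that record
    inline float& decodeRecord( float* record, int ix2 )
    {
      return record[ix2 * neppA]; // AOSOA[.][ix2][.] relative to the record base
    }

    // Composition, exactly as the removed comment states: accessField = decodeRecord( accessRecord )
    inline float& ieventAccessIx2( float* buffer, int ievt, int ix2 )
    {
      return decodeRecord( ieventAccessRecord( buffer, ievt ), ix2 );
    }

    int main()
    {
      float buf[8] = {};                 // 4 events x (Re,Im), with neppA=1
      ieventAccessIx2( buf, 2, 1 ) = 42; // Im part of event 2
      assert( buf[2 * nx2 + 1] == 42 );  // matches the flat AOS offset
      return 0;
    }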
 #ifndef MemoryAccessCouplings_H
 #define MemoryAccessCouplings_H 1
@@ -235,7 +235,7 @@ namespace mg5amcCpu
     /*
     fptype_sv& real = kernelAccessIx2( buffer, 0 );
     fptype_sv& imag = kernelAccessIx2( buffer, 1 );
-    printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+    printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
     return cxtype_sv_ref( real, imag );
     */
     return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ),
@@ -250,7 +250,7 @@ namespace mg5amcCpu
     /*
     const fptype_sv& real = kernelAccessIx2Const( buffer, 0 );
     const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 );
-    printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+    printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
     return cxtype_sv( real, imag );
     */
     return cxtype_sv( kernelAccessIx2Const( buffer, 0 ),
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
index 9f4c620bc7..bbffc1fb36 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessWavefunctions_H
 #define MemoryAccessWavefunctions_H 1
@@ -10,9 +10,7 @@

 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
+#include "CPPProcess.h"

 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
@@ -23,147 +21,44 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  // A class describing the internal layout of memory buffers for wavefunctions
-  // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessWavefunctionsBase //_AOSOAv1
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessWavefunctions
   {
   public:
-
-    // Number of Events Per Page in the wavefunction AOSOA memory buffer layout
-    static constexpr int neppW = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif

   //----------------------------------------------------------------------------

-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };

   //----------------------------------------------------------------------------

-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu

 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
index 5bd3053393..0ddc356e1a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
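To recap the MemoryAccessWavefunctions.h change just above: wavefunction buffers now use a trivial per-event layout, where each event owns a contiguous slice of CPPProcess::nw6 * mgOnGpu::nx2 floating point numbers, and on the device the event index comes from the CUDA thread coordinates. The stand-alone CUDA-style sketch below (illustrative only, not part of the patch; compile with nvcc; hardcoded nw6=6, nx2=2; all names local to the sketch) shows the same offset arithmetic.

    #include <cstddef>

    constexpr int nw6 = 6; // components of a fermion or vector wavefunction
    constexpr int nx2 = 2; // real and imaginary parts

    // Flat offset of (event ievt, component iw6, part ix2) in the per-event AOS layout
    // selected by "buffer + ievt * nw6 * nx2" in the new DeviceAccessWavefunctions
    __host__ __device__ inline std::size_t wfOffset( int ievt, int iw6, int ix2 )
    {
      return static_cast<std::size_t>( ievt ) * nw6 * nx2 // skip earlier events
             + static_cast<std::size_t>( iw6 ) * nx2      // skip earlier components
             + ix2;                                       // Re (0) or Im (1)
    }

    // Device access: the event index is implicit in the thread coordinates (1D grid)
    __device__ inline double* deviceAccess( double* buffer )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
      return buffer + wfOffset( ievt, 0, 0 ); // base of this event's nw6 complex values
    }

    // Host access: the caller has already positioned the pointer on a single event
    inline double* hostAccess( double* buffer ) { return buffer; }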
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
     static constexpr size_t nparf = CPPProcess::nparf;
     static constexpr size_t npar = CPPProcess::npar;
     static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
   }

   //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
   protected:
     BufferBase( const size_t size, const bool onDevice )
       : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
   public:
+    virtual ~BufferBase() {}
     T* data() { return m_data; }
     const T* data() const { return m_data; }
     T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
   public:
     HostBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
   {
   public:
     DeviceBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer DeviceBufferSimple;
+  typedef DeviceBuffer DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
   // A base class encapsulating a memory buffer for momenta random numbers
   typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
   typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
   typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
   typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventDenominators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
   typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators
   typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
   typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
   typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
   typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
   typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for color selection
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
index 1b3601c86b..4067d77373 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,332 +279,145 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
-      //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+      // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
       cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-      cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-      // Proof of concept for using fptype* in the interface
-      fptype* w_fp[nwf];
-      for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-      fptype* amp_fp;
-      amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-      // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-      // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-      cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+      fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+      // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+      // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+      fptype* wfs = allWfs;
+#endif

       // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
       // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Mixed fptypes #537: float for color algebra and double elsewhere
-      // Delay color algebra and ME updates (only on even pages)
-      cxtype_sv jamp_sv_previous[ncolor] = {};
-      fptype* MEs_previous = 0;
-#endif
+
+      // *****************************
+      // *** START LOOP ON IPARITY ***
+      // *****************************
       for( int iParity = 0; iParity < nParity; ++iParity )
-      { // START LOOP ON IPARITY
+      {
 #ifndef MGONGPUCPP_GPUIMPL
         const int ievt0 = ievt00 + iParity * neppV;
 #endif
-        //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-        constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-        const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-        for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-          allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-        //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-          allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+        // -----------------
+        // --- COUPLINGS ---
+        // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-        // CUDA kernels take input/output buffers with momenta/MEs for all events
-        const fptype* momenta = allmomenta;
-        const fptype* COUPs[nxcoup];
-        for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-        fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = allNumerators;
-        fptype* denominators = allDenominators;
-#endif
+        // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+        const fptype* couplings = allcouplings;
 #else
-        // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-        const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+        // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+        constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+        const fptype* allCOUPs[nxcoup];
         const fptype* COUPs[nxcoup];
+        // Dependent couplings, vary event-by-event
         for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-          COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-        //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-          COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-        fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-        fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-        // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-        for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-        fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-        // *** DIAGRAM 1 OF 6 ***
-
-        // Wavefunction(s) for diagram number 1
-        vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-        vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-        sxxxxx( momenta, +1, w_fp[2], 2 );
-
-        sxxxxx( momenta, +1, w_fp[3], 3 );
-
-        // Amplitude(s) for diagram number 1
-        VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] );
-        jamp_sv[1] += amp_sv[0];
-        VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] );
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 2 OF 6 ***
-
-        // Wavefunction(s) for diagram number 2
-        VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] );
-
-        // Amplitude(s) for diagram number 2
-        VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-        jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-        jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-
-        // *** DIAGRAM 3 OF 6 ***
-
-        // Wavefunction(s) for diagram number 3
-        VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-        // Amplitude(s) for diagram number 3
-        VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+          allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+        // Independent couplings, fixed for all events
+        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+          allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+        // Dependent couplings, vary event-by-event
+        for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+          COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+        // Independent couplings, fixed for all events
+        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+          COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 4 OF 6 ***
-        // Wavefunction(s) for diagram number 4
-        VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] );
-
-        // Amplitude(s) for diagram number 4
-        VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        // ---------------
+        // --- MOMENTA ---
+        // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+        // CUDA diagram kernels take input/output buffers with momenta for all events
+        const fptype* momenta = allmomenta;
+#else
+        // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+        const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 5 OF 6 ***
-
-        // Wavefunction(s) for diagram number 5
-        VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-        // Amplitude(s) for diagram number 5
-        VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        // -------------
+        // --- JAMPS ---
+        // -------------
+        // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+        // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+        // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+        fptype* jamps = allJamps;
+#else
+        // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+        // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+        fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-        jamp_sv[1] += amp_sv[0];
-
-        // *** DIAGRAM 6 OF 6 ***
-        // Wavefunction(s) for diagram number 6
-        VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] );
-
-        // Amplitude(s) for diagram number 6
-        VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] );
+        // ------------------
+        // --- CHANNELIDS ---
+        // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-        jamp_sv[1] += amp_sv[0];
-
-        // *** COLOR CHOICE BELOW ***
-        // Store the leading color flows for choice of color
-        if( jamp2_sv ) // disable color choice if nullptr
-          for( int icol = 0; icol < ncolor; icol++ )
-            jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-        // *** COLOR MATRIX BELOW ***
-        // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?)
-
-        // The color denominators (initialize all array elements, with ncolor=2)
-        // [NB do keep 'static' for these constexpr arrays, see issue #283]
-        static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2]
-
-        // The color matrix (initialize all array elements, with ncolor=2)
-        // [NB do keep 'static' for these constexpr arrays, see issue #283]
-        static constexpr fptype2 cf[ncolor][ncolor] = {
-          { 16, -2 },
-          { -2, 16 } }; // 2-D array[2][2]
-
-#ifndef MGONGPUCPP_GPUIMPL
-        // Pre-compute a constexpr triangular color matrix properly normalized #475
-        struct TriangularNormalizedColorMatrix
-        {
-          // See https://stackoverflow.com/a/34465458
-          __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-            : value()
-          {
-            for( int icol = 0; icol < ncolor; icol++ )
-            {
-              // Diagonal terms
-              value[icol][icol] = cf[icol][icol] / denom[icol];
-              // Off-diagonal terms
-              for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-                value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-            }
-          }
-          fptype2 value[ncolor][ncolor];
-        };
-        static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-        {
-          // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-          for( int icol = 0; icol < ncolor; icol++ )
-            jamp_sv_previous[icol] = jamp_sv[icol];
-          MEs_previous = MEs;
-          continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-        }
-        fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-        // Sum and square the color flows to get the matrix element
-        // (compute |M|^2 by squaring |M|, taking into account colours)
-        fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-        // Use the property that M is a real matrix (see #475):
-        // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-        // In addition, on C++ use the property that M is symmetric (see #475),
-        // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-        // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-        // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
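The removed color sum, whose loop follows below, computes |M|^2 as a quadratic form jamp-dagger cf jamp normalized by denom, using the #475 tricks described in the comments just above. The stand-alone C++ sketch below (illustrative only, not part of the patch; names local to the sketch) reproduces that arithmetic for this process's ncolor=2 color matrix, in both the triangular C++ form and the plain double-loop CUDA form; the triangular folding of "2*" and "/denom" is valid here because the two row denominators are equal.

    #include <cassert>
    #include <cmath>
    #include <complex>

    constexpr int ncolor = 2;
    constexpr double denom[ncolor] = { 3, 3 };                        // color denominators
    constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // color matrix

    // Upper-triangle color sum, mirroring the removed C++ path (#475)
    double colorSumTriangular( const std::complex<double> jamp[ncolor] )
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real(); // diagonal term
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // off-diagonal: factor 2 folds in the lower triangle
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
      }
      return me2;
    }

    // Full double-loop color sum, mirroring the removed CUDA path (no symmetry tricks)
    double colorSumFull( const std::complex<double> jamp[ncolor] )
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztempR = 0, ztempI = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
        {
          ztempR += cf[icol][jcol] * jamp[jcol].real();
          ztempI += cf[icol][jcol] * jamp[jcol].imag();
        }
        me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
      }
      return me2;
    }

    int main()
    {
      const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
      assert( std::abs( colorSumTriangular( jamp ) - colorSumFull( jamp ) ) < 1e-12 );
      return 0;
    }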
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 6 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -578,7 +444,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -611,6 +481,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + 
createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -651,6 +525,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -753,26 +631,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -780,25 +658,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran 
[1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -936,20 +1018,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -961,17 +1037,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -997,93 +1076,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1125,7 +1174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1148,7 +1197,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1157,25 +1206,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1185,8 +1240,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1202,11 +1259,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1308,14 +1366,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index d48c729c48..704925d121 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 4; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index 28f44ab169..b5d6d679c1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index 40fbb596f2..cde448f79e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
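For orientation before the new file body: for this P1 process ncolor=2, the color matrix is CF = {{16,-2},{-2,16}} with a common denominator 3, and each helicity adds a real quadratic form in the QCD partial amplitudes (jamps) to |M|^2. A minimal standalone sketch of that computation (plain C++ with illustrative names, not the plugin's fptype/SIMD accessor machinery), checking that the triangular form used by color_sum_cpu below matches the full symmetric-matrix form:

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double colorDenom[ncolor] = { 3, 3 };
  constexpr double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { -0.7, 0.4 } }; // dummy partial amplitudes
  // Full quadratic form: sum_ij ( ReJi*ReJj + ImJi*ImJj ) * CF_ij / denom_i
  // (the imaginary cross terms cancel because CF is real and symmetric)
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * colorMatrix[i][j] / colorDenom[i];
  // Triangular form (as in color_sum_cpu below): diagonal once, off-diagonal doubled
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = colorMatrix[i][i] / colorDenom[i] * jamp[i].real();
    double ztI = colorMatrix[i][i] / colorDenom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
      ztI += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( meFull - meTri ) < 1e-12 );
  printf( "deltaME = %f\n", meFull );
  return 0;
}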
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h new file mode 100644 index 0000000000..37e497fa4b --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h @@ -0,0 +1,193 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
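The new diagrams.h splits the former monolithic calculate_wavefunctions into one kernel per Feynman diagram: each kernel computes one amplitude from previously filled wavefunctions, optionally updates the single-diagram-enhancement numerator/denominator running sums, and accumulates the amplitude into the color-flow jamps. A schematic of that per-diagram pattern (illustrative scalar C++, not the plugin's accessor/SIMD code; cxabs2 stands in for the plugin's |z|^2 helper):

#include <complex>
using cxtype = std::complex<double>;
inline double cxabs2( const cxtype& z ) { return std::norm( z ); } // |z|^2

// Schematic update for one diagram (here "diagram 2", feeding both color flows)
void diagram_pattern( cxtype amp, unsigned int channelId, cxtype jamps[2], double& numerator, double& denominator )
{
  if( channelId == 2 ) numerator += cxabs2( amp );   // this diagram's |amp|^2 enhances its own channel
  if( channelId != 0 ) denominator += cxabs2( amp ); // running sum over all diagrams
  jamps[0] -= cxtype( 0, 1 ) * amp; // color-flow coefficients (-i and +i for this diagram)
  jamps[1] += cxtype( 0, 1 ) * amp;
}

int main()
{
  cxtype jamps[2] = {};
  double num = 0, den = 0;
  diagram_pattern( { 0.5, -0.2 }, 2, jamps, num, den );
  return 0;
}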
+ + /* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 6 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + sxxxxx( momenta, +1, w_fp[2], 2 ); + sxxxxx( momenta, +1, w_fp[3], 3 ); + // Amplitude(s) for diagram number 1 + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 6 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 6 *** + // Wavefunction(s) for diagram number 3 + VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 6 *** + // Wavefunction(s) for diagram number 4 + VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 4 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 6 *** + // Wavefunction(s) for diagram number 5 + VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 5 + VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 6 *** + // Wavefunction(s) for diagram number 6 + VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 6 + VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f index 3fc552a31d..25dc37ef1e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
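A note on the pattern above: all six diagram kernels share the same multichannel bookkeeping guarded by MGONGPU_SUPPORTS_MULTICHANNEL, where every diagram adds |amp|^2 to the denominator and only the diagram whose number matches the selected channelId also adds it to the numerator. Below is a minimal scalar sketch of that single-diagram-enhancement (SDE) logic, with std::complex<double> standing in for cxtype_sv and all names hypothetical; the generated kernels apply the same two lines per diagram to SIMD event pages (C++) or per-thread events (CUDA). diagram1 carries no such block, consistent with the contact-interaction diagram not being used as an SDE channel.

#include <cassert>
#include <complex>

// Scalar sketch (illustrative only) of the SDE accumulation in diagram2..diagram6.
struct SdeAccumulator
{
  unsigned int channelId = 0; // 1 to #diagrams selects a channel, 0 disables SDE
  double numerator = 0.;      // |amp|^2 of the selected diagram only
  double denominator = 0.;    // |amp|^2 summed over all contributing diagrams
  void addDiagram( unsigned int idiag, const std::complex<double>& amp )
  {
    const double abs2 = std::norm( amp ); // plays the role of cxabs2( amp_sv[0] )
    if( channelId == idiag ) numerator += abs2;
    if( channelId != 0 ) denominator += abs2;
  }
  double channelWeight() const // the multichannel weight applied downstream
  {
    assert( channelId != 0 && denominator != 0. );
    return numerator / denominator;
  }
};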
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index 1a1830b77a..60a896f60b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -295,7 +295,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -338,7 +338,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -383,23 +384,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WSU3.NE.0D0) FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), - $ ABS(MDL_MSU3*SMALL_WIDTH_TREATMENT)), MDL_WSU3) - IF(MDL_WSU6.NE.0D0) FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), - $ ABS(MDL_MSU6*SMALL_WIDTH_TREATMENT)), MDL_WSU6) + FK_ZERO = 0D0 + IF(MDL_WSU3.NE.0D0) THEN + FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), ABS(MDL_MSU3 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU3) + ELSE + FK_MDL_WSU3 = 0D0 + ENDIF + + IF(MDL_WSU6.NE.0D0) THEN + FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), ABS(MDL_MSU6 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU6) + ELSE + FK_MDL_WSU6 = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -451,10 +460,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -463,6 +474,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(3)*DCONJG(AMP(3)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
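The matrix1.f change above replaces the dense REAL*8 color matrix by an integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a single denominator DENOM: since the color matrix is symmetric, only the J >= I entries are stored, off-diagonal entries are kept pre-doubled, and one division by DENOM is applied at the very end. A rough C++ transcription of that reduction (illustrative names only; the Fortran additionally loops over NAMPSO split orders, omitted here):

#include <complex>
#include <vector>

// Color sum with a triangular integer color matrix (sketch).
// cf holds the upper triangle row by row with off-diagonal entries doubled,
// so taking the real part of conj(jamp[i]) * ztemp recovers the full
// symmetric bilinear form; the division by denom happens once at the end.
double colorSum( const std::vector<std::complex<double>>& jamp, // [ncolor]
                 const std::vector<int>& cf,                    // [ncolor*(ncolor+1)/2]
                 const int denom )
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me = 0.;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j];
    me += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  return me / denom;
}

For the two color flows of this process, cf = {16, -4, 16} with denom = 3 reproduces the dense matrix previously stored as 5.333.../-0.666..., since -4/3 is twice -2/3 and each off-diagonal pair is now counted only once.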
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': 
plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('missing an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external programs (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py +++ 
b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: 
- grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
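For the lhe_parser.py pseudorapidity fix a few hunks above: the corrected expression is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which is equivalent to -ln(tan(theta/2)); the old code had the ratio inverted and so returned -eta. A short self-contained check of the corrected formula (the helper below is illustrative, not the class property itself):

    import math

    def pseudorapidity(px, py, pz):
        # eta = 0.5 * ln((|p| + pz) / (|p| - pz))
        norm = math.sqrt(px**2 + py**2 + pz**2)
        return 0.5 * math.log((norm + pz) / (norm - pz))

    # cross-check against the equivalent definition -ln(tan(theta/2))
    px, py, pz = 1.0, 2.0, 3.0
    theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
    assert abs(pseudorapidity(px, py, pz) + math.log(math.tan(theta / 2))) < 1e-12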
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdirs stripped of the ones providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! 
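A note on the gridpack refinement above: when nprocs > 1 the patched gen_ximprove_gridpack enables split_channels and caps each job at maxevts events, so a channel needing needed_event events is divided into ceil(needed_event/max_request_event) jobs, clamped between 1 and max_splitting. A minimal sketch of that arithmetic (the default values here are illustrative, not necessarily the class attributes):

    def n_split_jobs(needed_event, max_request_event=2500, max_splitting=100):
        # ceiling division via (n - 1) // d + 1, as in get_job_for_event
        nb_split = (int(needed_event) - 1) // max_request_event + 1
        return max(1, min(nb_split, max_splitting))

    assert n_split_jobs(2500) == 1  # fits in a single job
    assert n_split_jobs(2501) == 2  # one event over the cap adds a job

Each split job then targets precision -goal_lum/nb_split rather than the whole channel's goal, and write_multijob() records nb_split so the combiner can reassemble the pieces afterwards.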
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h index ec627d7759..be48b2942a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template + template __device__ INLINE void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template + template __device__ INLINE void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template + template __device__ INLINE void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template + template __device__ INLINE void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template + template __device__ void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template + template __device__ void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S2 = W_ACCESS::kernelAccess( allS2 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P3[4] = { +cxreal( S3[0] ), +cxreal( S3[1] ), +cximag( S3[1] ), +cximag( S3[0] ) }; @@ -1021,7 +1021,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template + template __device__ void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -1034,7 +1034,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template + template __device__ void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -1067,7 +1067,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); const cxtype_sv* S4 = W_ACCESS::kernelAccessConst( allS4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP7 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..232fd37777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h index 26a532156c..faf4bea26d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -903,7 +903,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
   __device__ inline void
   G2COUP( const fptype gs[],
           fptype couplings[],
@@ -913,14 +913,14 @@ namespace mg5amcCpu
     using namespace Parameters_MSSM_SLHA2_dependentCouplings;
     const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
     DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_90s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 );
-    fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
-    fptype* GC_55s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 );
-    fptype* GC_57s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 );
-    cxtype_sv_ref GC_90s_sv = C_ACCESS::kernelAccess( GC_90s );
-    cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
-    cxtype_sv_ref GC_55s_sv = C_ACCESS::kernelAccess( GC_55s );
-    cxtype_sv_ref GC_57s_sv = C_ACCESS::kernelAccess( GC_57s );
+    fptype* GC_90s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 );
+    fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+    fptype* GC_55s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 );
+    fptype* GC_57s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 );
+    cxtype_sv_ref GC_90s_sv = CD_ACCESS::kernelAccess( GC_90s );
+    cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+    cxtype_sv_ref GC_55s_sv = CD_ACCESS::kernelAccess( GC_55s );
+    cxtype_sv_ref GC_57s_sv = CD_ACCESS::kernelAccess( GC_57s );
     GC_90s_sv = couplings_sv.GC_90;
     GC_6s_sv = couplings_sv.GC_6;
     GC_55s_sv = couplings_sv.GC_55;
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS

@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose if cuBLAS and hipBLAS are supported for the color sums
+// For both CUDA and HIP, by default, do not define MGONGPU_HAS_NO_BLAS (i.e. assume that BLAS is available), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
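As an illustrative aside (not part of the diff itself; the helper name colorSumDispatch is hypothetical), downstream code is expected to consume this build-time switch as follows:

#include "mgOnGpuConfig.h"
inline void colorSumDispatch()
{
#ifndef MGONGPU_HAS_NO_BLAS
  // cuBLAS/hipBLAS color-sum path (the default in CUDA and HIP builds)
#else
  // kernel-only fallback (always selected in C++ builds, where MGONGPU_HAS_NO_BLAS is defined unconditionally above)
#endif
}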
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 45c009959b..e4054b7ac8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step. 
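As an illustrative aside on the cxtype_ref compound assignments added above (values are made up; cxtype_ref, cxmake and fptype are the plugin's own types): the new operators accumulate directly into the referenced real and imaginary storage, avoiding a read-modify-write through a temporary cxtype.

fptype re = 1.;
fptype im = 2.;
cxtype_ref ref( re, im );
ref += cxmake( 0.5, -1. ); // now re == 1.5, im == 1.
ref -= cxmake( 1.5, 1. );  // now re == 0., im == 0.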
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.122 s +1 processes with 6 diagrams generated in 0.114 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.182 s +ALOHA: aloha creates 3 routines in 0.178 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.324s -user 0m1.250s -sys 0m0.065s +real 0m1.300s +user 0m1.221s +sys 0m0.067s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
     m_pmek->setGrid( m_gpublocks, m_gputhreads );
   }
 #endif
@@ -347,7 +353,9 @@
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
     copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     copyHostFromDevice( m_hstSelHel, m_devSelHel );
     copyHostFromDevice( m_hstSelCol, m_devSelCol );
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@
     }
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
       memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1

+#include "mgOnGpuConfig.h"
+
 #include <cassert>

 //--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )

 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )

 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

 //--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )

 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )

 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

 //--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =

 //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
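As an illustrative aside on the gpuBlasT* precision aliases and the checkGpuBlas macro introduced above (this helper is not part of the diff; its name, the column-major layout and the dimension names are assumptions), a checked, precision-agnostic GEMM over nevt events could be written once as:

#ifndef MGONGPU_HAS_NO_BLAS
// Y = A * X, where A is ncol x ncol and X, Y are ncol x nevt (column-major).
// gpuBlasTgemm resolves to cublasDgemm/hipblasDgemm by default, or to the
// Sgemm variants when MGONGPU_FPTYPE2_FLOAT is defined.
inline void colorMatrixTimesJamps( gpuBlasHandle_t handle, const fptype2* A, const fptype2* X, fptype2* Y, int ncol, int nevt )
{
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, ncol, nevt, ncol, &alpha, A, ncol, X, ncol, &beta, Y, ncol ) );
}
#endif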
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }

 //--------------------------------------------------------------------------

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1

 #include "mgOnGpuConfig.h"

+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"

 #include
+#include

 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
   // Does this host system support the SIMD used in the matrix element calculation?
   // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-  static bool hostSupportsSIMD( const bool verbose = true );
+  static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false

   private:

@@ -191,12 +194,24 @@
   // The buffer for the event-by-event couplings that depends on alphas QCD
   DeviceBufferCouplings m_couplings;

+  // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+  // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+  // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // The buffer for the event-by-event numerators of multichannel factors
-  DeviceBufferNumerators m_numerators;
+  // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;

-  // The buffer for the event-by-event denominators of multichannel factors
-  DeviceBufferDenominators m_denominators;
+  // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+  // The super-buffer of ncolor jamp2 buffers
+  DeviceBufferSimple m_colJamp2s;
 #endif

 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@
   PinnedHostBufferChannelIds m_hstChannelIds;
 #endif

+#ifndef MGONGPU_HAS_NO_BLAS
+  // Decide at runtime whether to use BLAS for color sums
+  bool m_blasColorSum;
+
+  // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+  bool m_blasTf32Tensor;
+
+  // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+  std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+  // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+  gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+  // The array of GPU streams (one for each good helicity)
+  gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
   // The number of blocks in the GPU grid
   size_t m_gpublocks;

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1

@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {

 //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
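For clarity, a worked example of the AOSOA[ipagA][ix2][ieppA] indexing implemented by this (now deleted) accessor, assuming neppA = 4 for illustration rather than the AOS value neppA = 1 used above (nx2 = 2 in both cases):

// ievt = 6  =>  ipagA = 6 / 4 = 1 and ieppA = 6 % 4 = 2
// real part (ix2 = 0): buffer[1 * 2 * 4 + 0 * 4 + 2] = buffer[10]
// imag part (ix2 = 1): buffer[1 * 2 * 4 + 1 * 4 + 2] = buffer[14]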
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
@@ -148,8 +35,6 @@
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };

 //----------------------------------------------------------------------------

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
index 3802fa57c0..26345d4b43 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
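// [Editor's sketch, not part of the patch] The new DeviceAccessWavefunctions class that
// opens above replaces the AOSOA helper machinery with trivial per-event access: in the
// kernelAccess methods just below, each CUDA thread (event) owns one contiguous slab of
// nw6 * nx2 fptype values per wavefunction buffer. An illustrative standalone sketch:
//
//   using fptype = double;            // illustrative floating point type
//   constexpr int nw6 = 6, nx2 = 2;   // 6 wavefunction components, re/im parts
//   inline fptype* eventSlab( fptype* buffer, const int ievt )
//   {
//     return buffer + ievt * nw6 * nx2; // event 0 at offset 0, event 1 at 12, ...
//   }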
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
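// [Editor's note, not part of the patch] One hunk below moves the virtual ~BufferBase()
// destructor from the protected to the public section. A minimal sketch of why this
// matters (illustrative names): deleting a derived buffer through a base-class pointer,
// e.g. via std::unique_ptr<Base>, requires an accessible (public) virtual destructor.
//
//   #include <memory>
//   struct Base
//   {
//   public:
//     virtual ~Base() {} // public and virtual: deleting through Base* is well defined
//   };
//   struct Derived : public Base {};
//   int main()
//   {
//     std::unique_ptr<Base> b{ new Derived }; // ill-formed if ~Base() were protected
//     return 0;
//   }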
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index 1d53b4a535..4067d77373 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,333 +279,145 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
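// [Editor's sketch, not part of the patch] A minimal CUDA illustration (hypothetical
// kernel names) of why the kernel splitting mentioned in the surrounding hunk forces
// intermediate wavefunctions into a global-memory buffer: stage1 and stage2 are separate
// launches, so stage1's output must outlive the first kernel, unlike a stack-local array.
//
//   __global__ void stage1( float* wfs ) { wfs[blockDim.x * blockIdx.x + threadIdx.x] = 1.f; }
//   __global__ void stage2( const float* wfs, float* out )
//   {
//     const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
//     out[ievt] = 2.f * wfs[ievt]; // reads what stage1 wrote, via global memory
//   }
//   // ... cudaMalloc wfs/out for nevt events, then:
//   // stage1<<<gpublocks, gputhreads>>>( wfs );      // analogous to diagram1
//   // stage2<<<gpublocks, gputhreads>>>( wfs, out ); // analogous to diagram2..6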
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 6 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - sxxxxx( momenta, +1, w_fp[2], 2 ); - - sxxxxx( momenta, +1, w_fp[3], 3 ); - - // Amplitude(s) for diagram number 1 - VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base
generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 2 OF 6 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 6 *** - - // Wavefunction(s) for diagram number 3 - VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 4 OF 6 *** - - // Wavefunction(s) for diagram number 4 - VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); - // Amplitude(s) for diagram number 4 - VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 5 OF 6 *** - - // Wavefunction(s) for diagram number 5 - VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 5 - VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] += amp_sv[0]; - - // *** DIAGRAM 6 OF 6 *** - // Wavefunction(s) for diagram number 6 - VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); - - // Amplitude(s) for diagram number 6 - VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 6 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -579,7 +444,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -612,6 +481,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -652,6 +525,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -754,26 +631,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x
* blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
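// [Editor's sketch, not part of the patch] The color choice below is inverse-CDF
// sampling: targetamp[] accumulates the allowed jamp2 weights, and the first color
// whose cumulative fraction exceeds the random number allrndcol[ievt] in [0,1) is
// selected (helicity selection in select_hel above follows the same idiom).
//
//   inline int selectIndex( const double* w, const int n, const double rnd )
//   {
//     double cum = 0, tot = 0;
//     for( int i = 0; i < n; i++ ) tot += w[i]; // w[i] >= 0, rnd in [0,1)
//     for( int i = 0; i < n; i++ )
//     {
//       cum += w[i];
//       if( rnd < cum / tot ) return i;
//     }
//     return n - 1; // guard against rounding
//   }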
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and
helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -944,13 +1025,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -962,17 +1037,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running
sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -998,93 +1076,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - 
const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp
!= nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1126,7 +1174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1149,7 +1197,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1158,25 +1206,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1186,8 +1240,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1203,11 +1259,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1309,14 +1366,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index d48c729c48..704925d121 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ 
b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 4; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
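+ // For reference, the quantity implemented in this file (in C++ SIMD, CUDA/HIP kernel and BLAS variants)
+ // is the color sum of one helicity's contribution to |M|^2; in the notation of the code below,
+ //   deltaME = sum_{i,j} Re( conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] )
+ // where the jamp[i] are the ncolor QCD partial amplitudes. The BLAS variant evaluates this in two
+ // steps per helicity, separately for the real and imaginary parts of the jamps: first Z = Mhat x J
+ // (one gemm), then ME_e += J_e . Z_e for each event e (one strided-batched gemm). This is a summary
+ // of the functions that follow, using the names they define.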
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
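+ // Concretely, writing jamp = A + i*B with A, B real and Mhat = colorMatrix/colorDenom:
+ //   (A - i*B)^T Mhat (A + i*B) = A^T Mhat A + B^T Mhat B + i*( A^T Mhat B - B^T Mhat A )
+ // where the imaginary cross terms cancel because Mhat is symmetric (A^T Mhat B = B^T Mhat A),
+ // so only the two real quadratic forms A^T Mhat A and B^T Mhat B are accumulated below.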
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h new file mode 100644 index 0000000000..9b25d7def9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h @@ -0,0 +1,194 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 6 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + sxxxxx( momenta, +1, w_fp[2], 2 ); + sxxxxx( momenta, +1, w_fp[3], 3 ); + // Amplitude(s) for diagram number 1 + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 6 *** + // Wavefunction(s) for
diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 6 *** + // Wavefunction(s) for diagram number 3 + VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 6 *** + // Wavefunction(s) for diagram number 4 + VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 4 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 6 *** + // Wavefunction(s) for diagram number 5 + VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 5 + VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 6 *** + // Wavefunction(s) for diagram number 6 + VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 6 + VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or
jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring
hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
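For reference, the "new1" jamp striding documented in DeviceAccessJamp above amounts to two contiguous ncolor*nevt matrices, real parts first and imaginary parts second, with the event index ievt running fastest. A minimal standalone C++ sketch of that indexing and of the in-place accumulation it must support (illustrative names and sizes only, not the plugin's actual buffers or types):

#include <cassert>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 4; // illustrative sizes
  std::vector<double> jamps( 2 * ncolor * nevt, 0. ); // [Re matrix][Im matrix]
  auto re = [=]( int icol, int ievt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }; // "new1" real index
  auto im = [=]( int icol, int ievt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }; // "new1" imag index
  jamps[re( 1, 3 )] += 0.5; // accumulate Re(jamp[icol=1]) for event 3
  jamps[im( 1, 3 )] -= 0.5; // accumulate Im(jamp[icol=1]) for event 3
  assert( jamps[1 * nevt + 3] == 0.5 && jamps[ncolor * nevt + 1 * nevt + 3] == -0.5 );
  return 0;
}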
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h index ec627d7759..be48b2942a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace
mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S2 = W_ACCESS::kernelAccess( allS2 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P3[4] = { +cxreal( S3[0] ), +cxreal( S3[1] ), +cximag( S3[1] ), +cximag( S3[0] ) }; @@ -1021,7 +1021,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -1034,7 +1034,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1.
); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -1067,7 +1067,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); const cxtype_sv* S4 = W_ACCESS::kernelAccessConst( allS4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP7 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..232fd37777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h index 26a532156c..faf4bea26d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -903,7 +903,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -913,14 +913,14 @@ namespace mg5amcCpu using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_90s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_55s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 ); - fptype* GC_57s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 ); - cxtype_sv_ref GC_90s_sv = C_ACCESS::kernelAccess( GC_90s ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_55s_sv = C_ACCESS::kernelAccess( GC_55s ); - cxtype_sv_ref GC_57s_sv = C_ACCESS::kernelAccess( GC_57s ); + fptype* GC_90s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_55s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 ); + fptype* GC_57s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 ); + cxtype_sv_ref GC_90s_sv = CD_ACCESS::kernelAccess( GC_90s ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_55s_sv = CD_ACCESS::kernelAccess( GC_55s ); + cxtype_sv_ref GC_57s_sv = CD_ACCESS::kernelAccess( GC_57s ); GC_90s_sv = couplings_sv.GC_90; GC_6s_sv = couplings_sv.GC_6; GC_55s_sv = couplings_sv.GC_55; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, use cuBLAS/hipBLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
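The comment above motivates the MGONGPU_HAS_NO_BLAS escape hatch, whose per-compiler defaults follow below. As a hedged, standalone sketch of how such a compile-time guard is typically consumed (the function name here is made up for illustration, it is not the plugin's code):

#include <cstdio>
// Illustrative only: build with -DMGONGPU_HAS_NO_BLAS to emulate a noBLAS build
static const char* colorSumPath()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return "BLAS (cuBLAS/hipBLAS) path"; // assumes BLAS headers and libraries are available
#else
  return "custom-kernel fallback path"; // no BLAS dependency at all
#endif
}
int main()
{
  std::printf( "color sum uses the %s\n", colorSumPath() );
  return 0;
}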
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 9e7dad46ce..f293ba7e7c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.109 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -576,53 +576,53 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s Wrote files for 10 helas calls in 0.076 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.132 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.129 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.854s -user 0m2.558s -sys 0m0.284s +real 0m3.019s +user 0m2.541s +sys 0m0.329s Code generation completed in 3 seconds ************************************************************ * * @@ -636,7 +636,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -644,9 +644,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -666,7 +666,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -674,9 +674,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 25f63a3016..265ec11c03 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
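Looking back at the cxtype_ref operator+= and operator-= additions in the mgOnGpuCxtypes.h hunk earlier: a minimal standalone sketch (illustrative class name, not the plugin's cxtype_ref) of why a reference proxy with in-place accumulation is convenient when the real and imaginary parts live in separate arrays, as in the split jamp storage above:

#include <cassert>
#include <complex>
// Illustrative proxy: references separately stored Re and Im parts, so that
// "jamp += amp" works even though the two parts are not adjacent in memory
class cxref
{
public:
  cxref( double& r, double& i ) : m_r( &r ), m_i( &i ) {}
  cxref& operator+=( const std::complex<double>& c )
  {
    *m_r += c.real();
    *m_i += c.imag();
    return *this;
  }
  cxref& operator-=( const std::complex<double>& c )
  {
    *m_r -= c.real();
    *m_i -= c.imag();
    return *this;
  }
  operator std::complex<double>() const { return { *m_r, *m_i }; }
private:
  double* const m_r; // const pointer to non-const real part
  double* const m_i; // const pointer to non-const imaginary part
};
int main()
{
  double re[2] = { 0., 0. }, im[2] = { 0., 0. }; // split storage
  cxref( re[1], im[1] ) += std::complex<double>( 1., -2. );
  assert( re[1] == 1. && im[1] == -2. );
  return 0;
}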
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1

+#include "mgOnGpuConfig.h"
+
 #include 

 //--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

 //--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

 //--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
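// A minimal sketch of how the vendor-neutral gpuBlas* macros above can be used; colorGemvSketch
// is an illustrative helper, not part of the plugin, and it assumes a CUDA or HIP build with
// BLAS enabled (checkGpuBlas is defined in GpuRuntime.h below). The same call compiles against
// cuBLAS under __CUDACC__ and against hipBLAS under __HIPCC__:
inline void colorGemvSketch( gpuBlasHandle_t handle, int n, const double* A, const double* x, double* y )
{
  const double one = 1, zero = 0;
  // y = A * x for an n x n column-major matrix (standard cublasDgemv/hipblasDgemv signature)
  checkGpuBlas( gpuBlasDgemv( handle, GPUBLAS_OP_N, n, n, &one, A, n, x, 1, &zero, y, 1 ) );
}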
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
 m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }

   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1

 #include "mgOnGpuConfig.h"

+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"

 #include 
+#include 

 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false

   private:

@@ -191,12 +194,24 @@
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;

+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;

-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif

 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif

+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1

@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@
 {
   //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 3802fa57c0..26345d4b43 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
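// The AOSOA[npagA][nx2][neppA] layout deleted above flattens to
// buffer[ipagA*nx2*neppA + ix2*neppA + ieppA]. A worked example of that index
// arithmetic with illustrative sizes (this helper is not part of the plugin):
#include <cassert>
inline int aosoaFlatIndexSketch()
{
  const int neppA = 4, nx2 = 2;   // 4 events per page, real/imaginary components
  const int ievt = 6, ix2 = 1;    // imaginary part of event 6
  const int ipagA = ievt / neppA; // page 1
  const int ieppA = ievt % neppA; // slot 2 within the page
  const int flat = ipagA * nx2 * neppA + ix2 * neppA + ieppA; // 8 + 4 + 2 = 14
  assert( flat == 14 );
  return flat;
}
// With neppA=1 (the AOS special case above) the layout degenerates to one record per event,
// which is why a trivial reinterpret_cast access suffices and these helpers could be removed.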
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
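// The new DeviceAccessWavefunctions above indexes one contiguous record of nw6*nx2 fptype
// slots per event, so event ievt starts at buffer + ievt*nw6*nx2. A worked example with
// illustrative values (this helper is not part of the plugin):
inline int wavefunctionOffsetSketch()
{
  const int nw6 = 6, nx2 = 2; // 6 wavefunction components, real/imaginary parts
  const int ievt = 100;       // in the kernel this is blockDim.x * blockIdx.x + threadIdx.x
  return ievt * nw6 * nx2;    // = 1200: first fptype slot of event 100's record
}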
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 5c62f1bfad..d79ea62148 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
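For orientation, the comments above contrast the two wavefunction storage schemes: trivial-access local buffers in C++, one global SoA super-buffer in CUDA after kernel splitting. A minimal sketch of the indexing difference follows; the [nwf][nw6][2][nevt] order is an assumption read off the wf[nwf*nw6*2*nevt] buffer comment in the calculate_jamps signature, and the helper name is hypothetical (illustration only, not part of the patch).

#include <cstddef>
// Hypothetical index helper: in C++, wavefunctions stay local temporaries indexed
// trivially as w_sv[iwf][iw6]; in CUDA they live in one global SoA buffer shared by
// all events, so consecutive threads (consecutive ievt) touch consecutive addresses
// (coalesced access across the split per-diagram kernels).
inline std::size_t
wfsIndex( int iwf, int iw6, int reim, int ievt, int nw6, int nevt ) // assumed [nwf][nw6][2][nevt] order
{
  return ( ( (std::size_t)iwf * nw6 + iw6 ) * 2 + reim ) * nevt + ievt;
}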
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
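For reference, the deleted color-algebra block that follows accumulates, per event, the real quadratic form deltaME = sum_ij jamp_i* (cf[icol][jcol]/denom[icol]) jamp_j. A minimal standalone sketch of that sum for this process (ncolor=2), using the denom and cf constants quoted below, with std::complex standing in for cxtype (illustration only, not part of the patch):

#include <complex>
double colorSumSketch( const std::complex<double> jamp[2] ) // one event, one helicity
{
  const double denom[2] = { 3, 3 };                   // color denominators (as below)
  const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // color matrix (as below)
  double deltaME = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < 2; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += std::real( ztemp * std::conj( jamp[icol] ) ) / denom[icol]; // real because cf is real and symmetric
  }
  return deltaME;
}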
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / 
neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
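The helicity filtering that sigmaKin_getGoodHel implements (in both the GPU and the C++ paths) follows one rule: a helicity combination is flagged "good" if at least one of the first maxtry events receives a non-zero |M|^2 contribution from it. A hypothetical sketch of that rule, assuming per-helicity trial MEs have already been computed; names and layout are illustrative only, not the actual implementation:

// Illustration only: flag helicity ihel as good if any trial event has a non-zero ME.
void getGoodHelSketch( const double* trialMEs, // [ncomb*maxtry] |M|^2 per helicity and trial event (assumed layout)
                       bool* isGoodHel,        // [ncomb] output flags
                       int ncomb,
                       int maxtry )
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
  {
    isGoodHel[ihel] = false;
    for( int ievt = 0; ievt < maxtry; ievt++ )
      if( trialMEs[ihel * maxtry + ievt] != 0 )
      {
        isGoodHel[ihel] = true;
        break;
      }
  }
}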
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) 
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 24c27005b8..5acfd9f387 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
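For orientation, a short annotation (mine, not text from the generated file): the three implementations in the new color_sum.cc below, the SIMD loop color_sum_cpu, the per-event kernel color_sum_kernel and the batched-GEMM path color_sum_blas, all evaluate the same per-helicity color sum

  deltaME = sum over icol,jcol of conj(jamp[icol]) * ( colorMatrix[icol][jcol] / colorDenom[icol] ) * jamp[jcol]

with ncolor=2, colorMatrix = {{16,-2},{-2,16}} and colorDenom = {3,3} for this process; they differ only in the data layout of jamp and in where the reduction runs.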
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
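To make the constexpr folding concrete, the following annotation (mine, not generated code; the values follow from colorDenom and colorMatrix above) spells out the reduced arithmetic for this ncolor=2 process:
+      // Illustration for ncolor=2: the triangular matrix folds at compile time to
+      //   cf2.value[0][0] = 16/3, cf2.value[0][1] = 2*(-2)/3 = -4/3, cf2.value[1][1] = 16/3
+      // so the loops below compute, per event,
+      //   deltaMEs = 16/3 * ( |jamp0|^2 + |jamp1|^2 )
+      //            - 4/3 * ( Re(jamp0)*Re(jamp1) + Im(jamp0)*Im(jamp1) )
+      // i.e. purely real multiply-adds over the upper triangle only.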
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..e0cad8ec2b --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
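As a reading aid (an annotation I derived from the J_ACCESS updates in the kernels below, not text from the generated file): each diagramN kernel computes one Feynman amplitude and accumulates it into the two color amplitudes, so that after all three calls

  jamp[0] = +i*amp(diagram 1) - amp(diagram 2)
  jamp[1] = -i*amp(diagram 1) - amp(diagram 3)

while in multichannel mode the numerator picks up |amp|^2 only from the diagram matching channelId and the denominator sums |amp|^2 over all three diagrams.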
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index c9610a83ed..d79945f299 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. 
+ return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
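A concrete index example for the "new1" striding may help here (my annotation, assuming ncolor=2 and nevt=4 purely for illustration):
+  // Example with ncolor=2, nevt=4 (annotation, not generated code):
+  //   real part of jamp(icol=1,ievt=2) -> buffer[0*2*4 + 1*4 + 2] = buffer[6]
+  //   imag part of jamp(icol=1,ievt=2) -> buffer[1*2*4 + 1*4 + 2] = buffer[14]
+  // i.e. the ncolor*nevt real parts come first (with ievt fastest), then the
+  // ncolor*nevt imaginary parts: exactly the two ncolor-by-nevt matrices that
+  // the GEMM calls in color_sum_blas address without any reordering.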
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
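(A usage note, mine rather than part of the makefile: the HASBLAS machinery added further down auto-detects the cuBLAS or hipBLAS headers and can be overridden on the command line, e.g. make HASBLAS=hasNoBlas to force the kernel-only color sum; the hasNoBlas choice defines -DMGONGPU_HAS_NO_BLAS, which the color_sum code above checks.)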
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
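The write_leshouche changes above replace the ad-hoc local_mass logic with a single beam_mass choice, boost all external momenta with zboost_with_beta, and then add a numerical-accuracy guard: if a boosted energy exceeds the corresponding beam energy, the event is re-boosted with the old eta-based method, which behaves better when a Bjorken x is close to one. A rough Python sketch of that guard (the helper names and the sign convention of the boost are illustrative assumptions, not taken from the patch):

```python
import math

def zboost(p, beta):
    # longitudinal boost of p = (E, px, py, pz) into a frame moving with velocity beta
    gamma = 1.0 / math.sqrt(1.0 - beta * beta)
    e, px, py, pz = p
    return (gamma * (e - beta * pz), px, py, gamma * (pz - beta * e))

def lab_momenta(p, pb, xbk, ebeam):
    # pb: momenta already boosted with the beam beta (zboost_with_beta in the patch)
    if pb[0][0] > ebeam[0] or pb[1][0] > ebeam[1]:
        # numerical accuracy lost: redo the boost the old way
        eta = math.sqrt(xbk[0] * ebeam[0] / (xbk[1] * ebeam[1]))
        beta = (eta - 1.0 / eta) / (eta + 1.0 / eta)  # velocity of the eta-built boost vector
        return [zboost(pi, -beta) for pi in p]
    return pb
```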
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
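The new flavour_bias run-card parameter above comes with a consistency check: it must hold exactly two non-negative numbers, abs(PDG) and an enhancement factor, and any non-trivial factor forces event_norm to 'bias' so that the correspondingly reduced event weights stay meaningful. A compact Python restatement of that check (standalone function, names illustrative):

```python
def check_flavour_bias(flavour_bias, event_norm):
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' needs two numbers: abs(PDG) and the enhancement factor")
    if any(x < 0 for x in flavour_bias):
        raise ValueError("flavour and multiplication factor must be positive")
    if flavour_bias[1] != 1 and event_norm != 'bias':
        return 'bias'  # caller resets event_norm, with a warning
    return event_norm
```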
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
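The cluster.py changes above teach the thread-based MultiCore runner about GPUs: the visible-device list is read from MG5_GPU_VISIBLE_DEVICES (or from the usual NVIDIA/CUDA and ROCR/HIP variable pairs), and each worker thread gets a private environment copy pinning it to one device, round-robin. A Python sketch of that assignment, where worker_loop stands in for the existing queue-consuming loop:

```python
import os
import threading

def worker_loop(env2=None):
    pass  # placeholder for the existing job-consuming loop; env2 is handed to subprocess calls

def start_workers(nb_core, gpus_list, set_var):
    workers = []
    for i in range(nb_core):
        env2 = None
        if gpus_list:
            env2 = os.environ.copy()
            env2[set_var] = gpus_list[i % len(gpus_list)]  # pin this worker to one GPU
        t = threading.Thread(target=worker_loop, kwargs={'env2': env2}, daemon=True)
        t.start()
        workers.append(t)
    return workers
```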
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py 
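The files.py hunk just below makes cp() tolerate copying a file onto itself (shutil raises for identical source and destination) and logs both paths on failure. A Python sketch of the intended behaviour; printf-style logging arguments are used here because logger.warning("fail to cp", path1, path2, why) as written in the patch would not format them:

```python
import logging
import shutil

logger = logging.getLogger(__name__)

def cp(path1, path2):
    try:
        shutil.copy(path1, path2)
    except shutil.Error as why:
        # raised e.g. when path1 and path2 are the same file: treat as a no-op
        logger.debug('no cp since identical: %s', why)
    except IOError as why:
        if 'same file' in str(why):
            return
        logger.warning('fail to cp %s %s: %s', path1, path2, why)
```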
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
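Among the lhe_parser.py fixes above, the pseudorapidity property had its sign flipped: with norm = |p|, the correct definition is eta = 0.5*ln((norm + pz)/(norm - pz)), whereas the old code returned its negative. As a quick check:

```python
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a particle moving forward (pz > 0) must have positive pseudorapidity
assert pseudorapidity(1.0, 0.0, 10.0) > 0
```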
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
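In the do_pythia8 hunks that follow, the default shower driver becomes Pythia8's bundled main164 example (invoked with -c for the command file), and the legacy MG5aMC_PY8_interface is only used on request (--old_interface) or as a fallback when main164 cannot be found. A Python sketch of the executable lookup:

```python
import os

def pick_pythia8_driver(options, use_mg5amc_py8_interface):
    if use_mg5amc_py8_interface:
        return os.path.join(options['mg5amc_py8_interface_path'], 'MG5aMC_PY8_interface')
    for parts in (('share', 'Pythia8', 'examples', 'main164'), ('examples', 'main164')):
        candidate = os.path.join(options['pythia8_path'], *parts)
        if os.path.exists(candidate):
            return candidate
    return None  # caller falls back to the old interface with a warning
```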
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the ones providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) +
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those!
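The remove_empty_events helper above scans each channel log in reverse (via misc.BackRead) to classify why an events.lhe file came out empty before dropping the channel. As a reading aid, here is a simplified stand-in with the same line-by-line, reverse-order iteration contract; this is a sketch that assumes the log fits in memory, whereas the real helper avoids loading the whole file at once:

import os

def back_read(path):
    # yield the lines of a text file from last to first (in-memory sketch)
    with open(path) as f:
        for line in reversed(f.readlines()):
            yield line

# usage mirroring the classification loop above (G is a channel directory):
# for line in back_read(os.path.join(G, 'log.txt')):
#     if 'Impossible BW configuration' in line:
#         ...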
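The refine4grid hunk above uses the new get_attr keyword (implemented in the sum_html.py hunk further below) to pull the additional axsec attribute out of the collected results: get_attr=None keeps the legacy (xsec, xerru) pair, a string returns a single attribute, and a tuple returns a list of attributes. A minimal self-contained model of that contract, where PresultsStub is a hypothetical stand-in for the real results object (the real function also renders the HTML pages):

class PresultsStub:
    xsec, xerru, axsec = 1.23, 0.04, 1.30

def make_all_html_results_model(get_attr=None):
    # mirrors the return logic added to sum_html.make_all_html_results
    if not get_attr:
        return PresultsStub.xsec, PresultsStub.xerru  # legacy two-value return
    if isinstance(get_attr, tuple):
        return [getattr(PresultsStub, attr) for attr in get_attr]
    return getattr(PresultsStub, get_attr)

assert make_all_html_results_model() == (1.23, 0.04)
assert make_all_html_results_model(get_attr='axsec') == 1.30
assert make_all_html_results_model(get_attr=('xsec', 'xerru', 'axsec')) == [1.23, 0.04, 1.30]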
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..22a10cc1e3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0a62a7059c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..2b51d933c5 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. 
// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -878,7 +878,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -888,10 +888,10 @@ namespace mg5amcCpu using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_51s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_51s_sv = CD_ACCESS::kernelAccess( GC_51s ); GC_6s_sv = couplings_sv.GC_6; GC_51s_sv = couplings_sv.GC_51; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 420090461f..0cf1202c7e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,12 +49,15 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 +INFO: load particles +INFO: load vertices +DEBUG: model prefixing takes 0.9217190742492676  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,45 +553,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.097 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.131 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.291s -user 0m1.202s +real 0m2.336s +user 0m2.218s sys 0m0.072s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
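// [Illustration only, not part of this patch] The unique_ptr members above implement
// a "super-buffer" pattern: one device allocation holds nGoodHel contiguous
// per-helicity slices and is only created once nGoodHel is known at runtime, e.g.
// (with illustrative local names nGoodHel, nevt and ighel):
//   std::unique_ptr<DeviceBufferSimple> pHelMEs;                // empty until helicity filtering
//   pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // one slice per good helicity
//   fptype* sliceMEs = pHelMEs->data() + ighel * nevt;          // slice for helicity ighel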
+ + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
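// [Context: a standalone sketch, not repository code] The index arithmetic implemented
// by this deleted AOSOA[npagA][nx2][neppA] class, as a self-contained function:
inline int aosoaIndexSketch( int ievt, int ix2, int neppA, int nx2 )
{
  const int ipagA = ievt / neppA; // event "A-page"
  const int ieppA = ievt % neppA; // event within the A-page
  return ipagA * nx2 * neppA + ix2 * neppA + ieppA; // AOSOA[ipagA][ix2][ieppA]
}
// With neppA = 1 as hardcoded above this degenerates to plain AOS, i.e. ievt * nx2 + ix2.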
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 3802fa57c0..26345d4b43 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
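The hunks below only relabel two debug printfs from C_ACCESS to CD_ACCESS, matching the alias actually used for the dependent couplings. For context, the accessors being relabelled compose a complex value out of separately stored real and imaginary fields; a minimal standalone sketch of that technique (illustrative stand-in types, not the plugin's cxtype_sv_ref machinery):

struct CxRefSketch { double& re; double& im; };   // stand-in for cxtype_sv_ref
inline CxRefSketch complexAccessSketch( double* buffer )
{
  return CxRefSketch{ buffer[0], buffer[1] };     // cf. kernelAccessIx2( buffer, 0 ) and ( buffer, 1 )
}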
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
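// [Context: a standalone sketch, not repository code] The analogous index arithmetic for
// this deleted wavefunction AOSOA[npagW][nw6][nx2][neppW] layout, which adds the nw6
// component axis on top of the amplitude case sketched earlier:
inline int aosoaWfIndexSketch( int ievt, int iw6, int ix2, int neppW, int nw6, int nx2 )
{
  const int ipagW = ievt / neppW; // event "W-page"
  const int ieppW = ievt % neppW; // event within the W-page
  return ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW; // AOSOA[ipagW][iw6][ix2][ieppW]
}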
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
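One small but consequential change in the hunks below: the virtual destructor of BufferBase moves from protected to public, which (among other things) makes it legal to own a buffer polymorphically and delete it through a base-class pointer. A hedged sketch of the pattern this enables (illustrative types, not the plugin classes):

#include <memory>
struct BufferBaseSketch { virtual ~BufferBaseSketch() {} };        // public virtual dtor
struct DeviceBufferSketch : BufferBaseSketch { /* owns device memory */ };
std::unique_ptr<BufferBaseSketch> pBuf( new DeviceBufferSketch );  // deletion via the base pointer is now well defined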
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 6867c6d67d..d79ea62148 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,294 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
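// [Context: a hedged sketch, not code from this patch] With the new kernel splitting, the
// CUDA path replaces such per-event local wavefunction buffers by per-event slices of the
// global allWfs buffer, mirroring the DeviceAccessWavefunctions::kernelAccess added earlier
// in this patch ("allWfs", "nw6" and "nx2" as in the surrounding code):
__device__ inline fptype* wfSliceSketch( fptype* allWfs, int nw6, int nx2 )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  return allWfs + ievt * nw6 * nx2;                       // wavefunction slice for this event
}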
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif
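The "proof of concept for using fptype* in the interface" above, and the new "fptype* wfs = reinterpret_cast<fptype*>( w_sv )" line, both rest on one layout assumption: a complex value is exactly two contiguous fptype values, so an array of complex wavefunctions can be handed to kernels as a flat fptype array (for the SIMD cxtype_sv the argument is analogous, with one real vector followed by one imaginary vector). A scalar sketch of that assumption, with std::complex standing in for the plugin's cxtype:

#include <complex>
#include <cstdio>
typedef double fptype;
typedef std::complex<double> cxtype; // stand-in for the plugin's complex type
int main()
{
  constexpr int nwf = 5, nw6 = 6; // five wavefunctions of six components each
  cxtype w_sv[nwf][nw6] = {};
  w_sv[0][0] = cxtype( 1., 2. );
  static_assert( sizeof( cxtype ) == 2 * sizeof( fptype ), "one complex must be two fptypes" );
  const fptype* wfs = reinterpret_cast<const fptype*>( w_sv ); // the same cast as above
  std::printf( "re=%f im=%f\n", wfs[0], wfs[1] ); // prints re=1.000000 im=2.000000
  return 0;
}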
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions...
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -552,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -727,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt /
neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over the good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // ME sum over the good helicities + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -917,13 +1031,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -971,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the numerators and denominators for each helicity for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1159,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 24c27005b8..5acfd9f387 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -1,13 
+1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
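Before the implementation below, this is the computation that color_sum.cc encapsulates, reduced to its simplest scalar form: deltaME = sum_ij conj(jamp_i) (cf_ij / denom_i) jamp_j, using the 2x2 color matrix and denominators defined below for this process. The jamp inputs in this sketch are made-up numbers, and the loop deliberately ignores SIMD, GPU streams, BLAS and mixed precision:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -0.1 }, { -0.2, 0.4 } }; // made-up inputs
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol]; // imaginary parts cancel (cf is symmetric)
  }
  std::printf( "deltaME = %f\n", deltaME ); // one helicity's contribution to |M|^2
  return 0;
}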
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
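Restating the #475 comment above in formulas: write the vector of color amplitudes as J = A + iB with A, B real, and let M be the normalized color matrix, M_{ij} = cf_{ij} / denom_i, which is real and (the denominators being equal here) symmetric. Then

$$ J^\dagger M J = (A - iB)^T M (A + iB) = A^T M A + B^T M B + i \, ( A^T M B - B^T M A ) = A^T M A + B^T M B $$

since the cross terms cancel for M = M^T. The triangular cf2 additionally uses $A^T M A = \sum_i M_{ii} A_i^2 + \sum_{i<j} 2 M_{ij} A_i A_j$, which is why it stores M_{ii} on the diagonal and 2 M_{ij} above it.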
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note, new striding for cuBLAS from DeviceAccessJamp:
+ // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+ // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+ // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+ // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+ fptype2 alpha1 = 1;
+ fptype2 beta1 = 0;
+ const int ncolorM = ncolor;
+ const int nevtN = nevt;
+ const int ncolorK = ncolor;
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+ // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+ // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+ fptype2 alpha2 = 1;
+ fptype2 beta2 = 1;
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Tmp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h new file mode 100644 index 0000000000..faf1602413 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h @@ -0,0 +1,106 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. 
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 3 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 3 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 3 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] );
+ // Amplitude(s) for diagram number 3
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
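
For reference, the two BLAS steps in color_sum_blas above (one GEMM per real/imaginary component, then one strided-batched GEMM of 1x1 dot products per event) compute nothing more than the following plain loop per helicity. This is an editorial sketch for clarity, assuming the "new1" striding described above; normCf stands for the row-normalized color matrix (colorMatrix[icol][jcol]/colorDenom[icol]) copied to device memory.

  // Host-side reference loop equivalent to Step 1 + Step 2 of color_sum_blas (sketch)
  void colorSumReference( double* allMEs,         // in/out: allMEs[nevt], running sum over helicities
                          const double* allJamps, // input: jamps[2*ncolor*nevt] in "new1" striding
                          const double* normCf,   // input: normalized color matrix [ncolor*ncolor]
                          int ncolor,
                          int nevt )
  {
    const double* jampR = allJamps;                 // real parts: [icol * nevt + ievt]
    const double* jampI = allJamps + ncolor * nevt; // imaginary parts: same striding
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // Step 1 (GEMM): ztemp[icol][ievt] = sum_jcol normCf[icol][jcol] * jamp[jcol][ievt]
        double ztempR = 0, ztempI = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
        {
          ztempR += normCf[icol * ncolor + jcol] * jampR[jcol * nevt + ievt];
          ztempI += normCf[icol * ncolor + jcol] * jampI[jcol * nevt + ievt];
        }
        // Step 2 (batched 1x1 GEMM): dot product of the jamp and ztemp columns for this event
        deltaME += jampR[icol * nevt + ievt] * ztempR + jampI[icol * nevt + ievt] * ztempI;
      }
      allMEs[ievt] += deltaME; // ADD this helicity's |M|^2 to the running sum (beta2=1 above)
    }
  }
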
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
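
The "new1" striding hardcoded in DeviceAccessJamp above is easy to check on the host: the real parts of all jamps form one contiguous ncolor*nevt block with ievt fastest, and the imaginary parts a second identical block, which is exactly what lets color_sum_blas hand each block to cuBLAS/hipBLAS as a dense matrix. A standalone illustration with toy sizes (not plugin code):

  #include <cstdio>

  int main()
  {
    const int ncolor = 2, nevt = 4; // toy sizes
    for( int icol = 0; icol < ncolor; icol++ )
      for( int ievt = 0; ievt < nevt; ievt++ )
      {
        const int iReal = 0 * ncolor * nevt + icol * nevt + ievt; // "new1" real part offset
        const int iImag = 1 * ncolor * nevt + icol * nevt + ievt; // "new1" imaginary part offset
        printf( "icol=%d ievt=%d -> re@%2d im@%2d\n", icol, ievt, iReal, iImag );
      }
    return 0;
  }
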
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ); // input: cuda gputhreads
+#endif
+
+ //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 else ifeq ($(BACKEND),hip)
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)
   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 #-------------------------------------------------------------------------------
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
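
The boilerplate that follows documents the GPU wavefunction super-buffer layout in detail: nwf wavefunction blocks of nevt*nw6*nx2 fptypes each, with one contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 values per event. A quick standalone check of that pointer arithmetic (toy sizes, editorial illustration only):

  #include <cassert>

  int main()
  {
    const int nwf = 5, nevt = 8, nw6 = 6, nx2 = 2; // toy nwf/nevt; nw6 and nx2 as in the comments below
    double wfs[5 * 8 * 6 * 2];
    double* w_fp[5];
    for( int i = 0; i < nwf; i++ ) w_fp[i] = wfs + i * nevt * nw6 * nx2; // as in the boilerplate
    // The iw6-th complex component of wavefunction iwf for event ievt:
    const int iwf = 3, ievt = 2, iw6 = 4;
    double* re = &w_fp[iwf][ievt * nw6 * nx2 + iw6 * nx2]; // stride between events is nw6*nx2 = 12
    double* im = re + 1;                                   // imaginary part follows the real part
    assert( im - wfs == iwf * nevt * nw6 * nx2 + ievt * nw6 * nx2 + iw6 * nx2 + 1 );
    return 0;
  }
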
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
index 9ed58e24f1..22a10cc1e3 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 VVV1P0_1( const fptype allV2[],
 const fptype allV3[],
@@ -873,7 +873,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -885,7 +885,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -898,7 +898,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -911,7 +911,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 VVV1P0_1( const fptype allV2[],
 const fptype allV3[],
@@ -924,7 +924,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
 const cxtype cI = cxmake( 0., 1. );
 const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) };
@@ -949,7 +949,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -962,7 +962,7 @@ namespace mg5amcCpu
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -974,7 +974,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -987,7 +987,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
 const cxtype cI = cxmake( 0., 1. );
 F1[0] = +F2[0] + V3[0];
@@ -1006,7 +1006,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -1019,7 +1019,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
 const cxtype cI = cxmake( 0., 1. );
 F2[0] = +F1[0] + V3[0];
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
index aa00d6a9e4..0a62a7059c 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
index 3e29f2ccbe..2b51d933c5 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -878,7 +878,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
 // Compute the output couplings (e.g. gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
 __device__ inline void
 G2COUP( const fptype gs[],
 fptype couplings[],
@@ -888,10 +888,10 @@
 using namespace Parameters_MSSM_SLHA2_dependentCouplings;
 const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
 DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
- fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 );
- cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
- cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s );
+ fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+ fptype* GC_51s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 );
+ cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+ cxtype_sv_ref GC_51s_sv = CD_ACCESS::kernelAccess( GC_51s );
 GC_6s_sv = couplings_sv.GC_6;
 GC_51s_sv = couplings_sv.GC_51;
 mgDebug( 1, __FUNCTION__ );
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
+// Choose if cuBLAS and hipBLAS are supported for computing color sums
+// For both CUDA and HIP, by default, enable BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
 #endif /* clang-format off */
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh index eb39e2b302..89d91e7096 100755 --- a/epochX/cudacpp/tmad/allTees.sh +++ b/epochX/cudacpp/tmad/allTees.sh @@ -1,23 +1,41 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) host=$(hostname) if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498 +# Usage +function usage() +{ + echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]" + echo "Run tests and check all logs" + echo "" + echo "Usage (2): $0 -checkonly" + echo "Check existing logs without running any tests" + exit 1 +} + +# Parse command line arguments +checkonly=0 short=0 bsm= flts=-dmf # "d m f" (alternative: -d_f i.e. "d f") makeclean= rmrdat= -add10x="+10x" +add10x= hip= - -while [ "$1" != "" ]; do +if [ "$1" == "-checkonly" ]; then + # Check existing logs without running any tests? 
diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh
index eb39e2b302..89d91e7096 100755
--- a/epochX/cudacpp/tmad/allTees.sh
+++ b/epochX/cudacpp/tmad/allTees.sh
@@ -1,23 +1,41 @@
 #!/bin/bash
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin.
-# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 scrdir=$(cd $(dirname $0); pwd)
 
 host=$(hostname)
 if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498
 
+# Usage
+function usage()
+{
+  echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]"
+  echo "Run tests and check all logs"
+  echo ""
+  echo "Usage (2): $0 -checkonly"
+  echo "Check existing logs without running any tests"
+  exit 1
+}
+
+# Parse command line arguments
+checkonly=0
 short=0
 bsm=
 flts=-dmf # "d m f" (alternative: -d_f i.e. "d f")
 makeclean=
 rmrdat=
-add10x="+10x"
+add10x=
 hip=
-
-while [ "$1" != "" ]; do
+if [ "$1" == "-checkonly" ]; then
+  # Check existing logs without running any tests?
+  checkonly=1
+  shift
+  if [ "$1" != "" ]; then usage; fi
+fi
+while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do
   if [ "$1" == "-short" ]; then
     short=1 # all (possibly including bsm) but ggttggg
     shift
@@ -27,8 +45,8 @@ while [ "$1" != "" ]; do
   elif [ "$1" == "-makeclean" ]; then
     makeclean=$1
     shift
-  elif [ "$1" == "-no10x" ]; then
-    add10x=""
+  elif [ "$1" == "+10x" ]; then
+    add10x=$1
     shift
   elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then
     bsm=$1
@@ -40,42 +58,72 @@
     hip=$1
     shift
   else
-    echo "Usage: $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [-no10x] [-hip]"
-    exit 1
+    usage
   fi
 done
 
-started="STARTED AT $(date)"
-
-if [ "${bsm}" != "-bsmonly" ]; then
-  if [ "$short" == "1" ]; then
-    ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip
-  elif [ "$short" == "-1" ]; then
-    ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip
-  else
-    ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip
+# Run all tests
+if [ "${checkonly}" == "0" ]; then
+  started="STARTED AT $(date)"
+  # SM tests
+  if [ "${bsm}" != "-bsmonly" ]; then
+    if [ "$short" == "1" ]; then
+      ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip
+    elif [ "$short" == "-1" ]; then
+      ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip
+    else
+      ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip
+    fi
   fi
-fi
-status=$?
-ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]"
-
-if [ "${bsm}" != "-nobsm" ]; then
-  if [ "$short" != "-1" ]; then
-    ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip
+  status=$?
+  ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]"
+  # BSM tests
+  if [ "${bsm}" != "-nobsm" ]; then
+    if [ "$short" != "-1" ]; then
+      ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip
+    fi
  fi
+  status=$?
+  ended2="(BSM tests)\nENDED(2) AT $(date) [Status=$status]"
+  # Timing information
+  echo
+  printf "\n%80s\n" |tr " " "#"
+  echo
+  echo -e "$started"
+  echo -e "$ended1"
+  echo -e "$ended2"
+  echo
 fi
-status=$?
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]"
 
 # Print out the number of "OK!"s in each log (expect 24)
+for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24
+
+# Print out any errors or aborts in the logs
 echo
-printf "\n%80s\n" |tr " " "#"
+txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No errors or aborts found in logs"
+else
+  echo "${txt}"
+fi
+
+# Print out any asserts in the logs
 echo
-echo -e "$started"
-echo -e "$ended1"
-echo -e "$ended2"
+txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No asserts found in logs"
+else
+  echo "${txt}"
+fi
+
+# Print out any segfaults in the logs
 echo
-for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24
+txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No segmentation fault found in logs"
+else
+  echo "${txt}"
+fi
 
 # Print out the MEK channelid debugging output
 echo
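In short, the refactored allTees.sh now has two modes: "./allTees.sh -checkonly" skips the teeMadX.sh runs entirely and only re-scans the existing logs_*_mad logs (counting the expected 24 "OK!" lines per log and grepping for errors, aborts, asserts and segmentation faults), while a normal run executes the SM and BSM test suites first and then performs the same log checks. Note also the flag reversal: tenfold statistics are now opt-in via "+10x", replacing the removed opt-out flag "-no10x". (The "ENDED(2)" label in the BSM timing message above corrects a copy-paste "ENDED(1)" in the patch as submitted.)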
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" # Print out the number of "OK!"s in each log (expect 24) +for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 + +# Print out any errors or aborts in the logs echo -printf "\n%80s\n" |tr " " "#" +txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No errors or aborts found in logs" +else + echo "${txt}" +fi + +# Print out any asserts in the logs echo -echo -e "$started" -echo -e "$ended1" -echo -e "$ended2" +txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No asserts found in logs" +else + echo "${txt}" +fi + +# Print out any segfaults in the logs echo -for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 +txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No segmentation fault found in logs" +else + echo "${txt}" +fi # Print out the MEK channelid debugging output echo diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index c9c9460105..d835a5038f 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:14 +DATE: 2025-09-24_09:40:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7368s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7611s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7533s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2101s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2226s + [COUNTERS] Fortran Overhead ( 0 ) : 
0.2147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.158620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104818e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106592e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.887925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.682136e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.752662e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2160s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2124s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2201s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2165s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.47E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307123e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667984e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.575803e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.58E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703640e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.730901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.854606e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 8192 events => throughput is 1.70E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.085135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807316e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.218811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.926326e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6534s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6489s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.11E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-00 OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.299210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643554e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.396194e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868548e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.648185e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.033996e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911449e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.933419e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.053853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.238409e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 13ceac3a87..739cb36246 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:39 +DATE: 2025-09-24_09:40:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7370s - [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7524s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2108s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2228s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2150s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432777448196335E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2197s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.2224s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.10E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** 
(2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002) differ by less than 4E-4 (1.305336294610271e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777448196335E-002) differ by less than 4E-4 (1.298238181401956e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221258e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105792e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149836e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774879426222E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.56E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002) differ by less than 4E-4 (1.5804696607002455e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774879426222E-002) differ by less than 4E-4 (1.5761449856377396e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.737799e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.881039e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774837279630E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2177s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.49E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774837279630E-002) differ by less than 4E-4 (1.5807046871429975e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.997328e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.556846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.092481e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774837279630E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2175s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.62E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774837279630E-002) differ by less than 4E-4 (1.5807046871429975e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.992748e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.604389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.419746e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432778581375011E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2209s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2182s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002) differ by less than 4E-4 (1.1783227071848756e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778581375011E-002) differ by less than 4E-4 (1.1756433015985834e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.402847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799518e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641263e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.047783e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432780016531851E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.6551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6505s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432780016531851E-002) differ by less than 4E-4 (1.0203783951112655e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.451436e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269152e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688055e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.461078e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.014252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890598e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229387e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106832e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.787718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912802e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.220221e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096461e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.380548e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.776097e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098447e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 093bec81e5..ee7fc520cb 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:27 +DATE: 2025-09-24_09:40:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7527s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7510s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.133245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069043e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096393e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2170s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 8192 events => throughput is 1.64E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.993139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684928e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.058944e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852659e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.29E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.593852e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824521e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2171s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.606715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.864576e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168112e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.80E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.878395e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.284212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905957e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789437826970E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789437826984E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6470s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6576s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6531s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.09E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826970E-002) differ by less than 2E-4 (1.1194101201539297e-10) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826984E-002) differ by less than 2E-4 (1.1194078997078805e-10) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.550305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.290600e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038238e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857337e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.623999e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.057820e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.897534e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.893156e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068143e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257736e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 794f102690..3f2f6b7530 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
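The "OK! xsec from fortran (...) and cpp (...) differ by less than <tol>" lines in these logs print a relative difference between the reference Fortran cross section and the cudacpp one, with a backend-dependent tolerance (3E-14 for double precision, 2E-4 for mixed, 4E-4 for float in the logs above and below). The printed deltas are exact multiples of 2**-53 (e.g. 3.3306690738754696e-16 = 3 * 2**-53), which is consistent with a double-precision computation of the form abs(1 - xsec_new / xsec_ref). The sketch below is a hedged reconstruction of that check with hypothetical names, not the repository's actual comparison script:

  # Hedged reconstruction of the xsec cross-check printed in these logs.
  # 'rel_diff' and 'check_xsec' are hypothetical names.
  def rel_diff(xsec_ref: float, xsec_new: float) -> float:
      # Double-precision relative difference; for the gg_ttx values below,
      # fortran 47.138611968034176 vs cpp 47.138611968034162 gives
      # 3.3306690738754696e-16, i.e. exactly 3 * 2**-53, as in the log.
      return abs(1.0 - xsec_new / xsec_ref)

  def check_xsec(xsec_ref: float, xsec_new: float, tolerance: float) -> None:
      delta = rel_diff(xsec_ref, xsec_new)
      assert delta < tolerance, f"xsec mismatch: {delta} >= {tolerance}"
      print(f"OK! xsec differ by less than {tolerance} ({delta})")

  check_xsec(47.138611968034176, 47.138611968034162, 3e-14)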
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:53 +DATE: 2025-09-24_09:40:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8494s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8073s - [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8577s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8141s + [COUNTERS] Fortran MEs ( 1 ) : 0.0436s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4569s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4133s + [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4555s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0453s for 8192 events => throughput is 1.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4659s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4181s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0474s for 8192 events => throughput is 1.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.856020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799101e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804128e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4444s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 8192 events => throughput is 2.99E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.314758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.072714e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.321531e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083395e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.03E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.263509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.875847e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.327379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.001397e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4322s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.21E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.648502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.170619e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.831851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.464737e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4392s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034169) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424710e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469326e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8664s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.31E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611968034176) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.103830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.022933e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.448285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070706e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914638e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.627647e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036305e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.886865e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.920677e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.006782e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071087e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.862106e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.917421e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.715892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621738e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 2bf2a37cc7..0353138ae2 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
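Each "[COUNTERS] ... => throughput is ..." line in these logs is the plain ratio of processed events to the time spent in that code section; the printed figure is derived from the unrounded timing, so recomputing it from the rounded seconds shown in the log can differ in the last digit. A minimal recomputation, with a hypothetical helper name:

  def throughput(n_events: int, seconds: float) -> float:
      # Events per second, as printed in the [COUNTERS] lines.
      return n_events / seconds

  # Fortran MEs in the gg_ttx double-precision log: 8192 events in 0.0436s.
  print(f"{throughput(8192, 0.0436):.2E} events/s")  # 1.88E+05, matching the log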
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:22 +DATE: 2025-09-24_09:41:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8027s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4590s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487851646206e-07) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.973574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883634e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889115e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.81E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059336795098e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.659841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.331462e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.375376e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602536968548] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4247s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0089s for 8192 events => throughput is 9.23E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602536968548) differ by less than 4E-4 (2.0007092349505484e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.079796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.572016e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.235810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.493727e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602536968548] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0085s for 8192 events => throughput is 9.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602536968548) differ by less than 4E-4 (2.0007092349505484e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.970038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.743951e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.765544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.946551e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138606859855095] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4258s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4274s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612277499476e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138606859855095) differ by less than 4E-4 (1.083650720268281e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.636236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.335193e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.862568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.440519e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138612402172164] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138612410631097] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8634s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8632s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8580s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 8.31E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138612402172164) differ by less than 4E-4 (9.209817353195149e-09) +OK! xsec from fortran (47.138611968034176) and cuda (47.138612410631097) differ by less than 4E-4 (9.389264921111362e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.093880e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.444767e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.450343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.714328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.021092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591418e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.359313e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094164e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014796e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.593199e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375647e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068726e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628808e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.570569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.004427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377359e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2ae843d323..92ff8109b4 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
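The CHECK/GCHECK executions in these logs take arguments of the form "-p <blocks> <threads> <iterations>", and the event counts quoted in the section titles are consistent with blocks times threads per iteration: 256 x 32 = 8192, while the three GCHECK(MAX*) configurations (16384 x 32, 4096 x 128, 65536 x 8) all give 524288. This is a hedged reading of the numbers in the logs, not a statement of the tool's documented interface:

  # Grid configurations from the GCHECK sections in these logs; the quoted
  # event counts match blocks * threads (hedged reading of '-p b t i').
  configs = {
      "GCHECK(8192)": (256, 32, 1),
      "GCHECK(MAX)": (16384, 32, 1),
      "GCHECK(MAX128THR)": (4096, 128, 1),
      "GCHECK(MAX8THR)": (65536, 8, 1),
  }
  for name, (blocks, threads, _iterations) in configs.items():
      print(f"{name}: {blocks * threads} events per iteration")
  # 8192, then 524288 for each of the three MAX configurations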
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:08 +DATE: 2025-09-24_09:41:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8015s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8555s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8122s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4498s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4550s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759566586473e-08) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.815647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782893e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.845071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.785685e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4477s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0267s for 8192 events => throughput is 3.07E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.088238e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.130119e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.09E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.315398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.041293e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.422217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062752e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.73E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.35E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.854463e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.522361e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.901611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.428731e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613350418026] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4399s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418026) differ by less than 2E-4 (2.9325934569612855e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523885e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520339e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611963547788] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613301020499] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 6.56E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611963547788) differ by less than 2E-4 (9.517409083059647e-11) +OK! xsec from fortran (47.138611968034176) and cuda (47.138613301020499) differ by less than 2E-4 (2.8278013930460588e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.987528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.074070e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013331e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.920469e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.589038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036622e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871326e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.893637e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.949192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.072188e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.873573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913523e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717025e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621290e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 0c7ed732ed..4e7ca0cd21 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
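
[Editor's note] The "differ by less than 2E-4" lines in the logs above quote, in parentheses, the relative difference between the Fortran and cudacpp cross sections. Below is a minimal Python sketch of that check, assuming the denominator is the Fortran reference value (the helper name xsec_rel_diff is hypothetical, not code from the actual tmad scripts); the numbers are copied from the (2-sse4) gg_ttx comparison above and reproduce the quoted ~2.84e-08.

  # Hypothetical sketch of the xsec tolerance check, inferred from the logs:
  # rel = |xsec_new - xsec_ref| / |xsec_ref|, compared to a per-test tolerance.
  def xsec_rel_diff(xsec_ref, xsec_new):
      return abs(xsec_new - xsec_ref) / abs(xsec_ref)

  xsec_fortran = 47.138611968034176  # fortran value from the log above
  xsec_cpp = 47.138613306947953      # cpp (sse4) value from the log above
  rel = xsec_rel_diff(xsec_fortran, xsec_cpp)
  assert rel < 2e-4                  # the double-precision tolerance quoted above
  print(rel)                         # ~2.84e-08, matching the log
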
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:36 +DATE: 2025-09-24_09:41:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7416s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4124s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4146s + [COUNTERS] Fortran MEs ( 1 ) : 0.3426s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3873s - [COUNTERS] Fortran MEs ( 1 ) : 0.3304s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3903s + [COUNTERS] Fortran MEs ( 1 ) : 0.3435s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3938s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3570s 
for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.455924e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.356249e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454100e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346054e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1793s for 8192 events => throughput is 4.57E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.5864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1942s for 8192 events => throughput is 4.22E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.669927e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.278626e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620836e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.269078e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748540E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3884s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4900s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0970s for 8192 events => throughput is 8.45E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748540E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.331277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.703454e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.327490e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.541417e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748540E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0878s for 8192 events => throughput is 9.33E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748540E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482223e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.042752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.443069e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748581E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1150s for 8192 events => throughput is 7.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3945s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1175s for 8192 events => throughput is 6.97E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748581E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.198283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.098352e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.275587e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.113802e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8395s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8270s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s + [COUNTERS] PROGRAM TOTAL : 0.8451s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0081s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216712e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331258e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.454522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110985e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.167720e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131706e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.412863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111080e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157398e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.441638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112521e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.653840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025597e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index fbc0c57cb4..640f442f94 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
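
[Editor's note] The [COUNTERS] blocks above decompose PROGRAM TOTAL into the Fortran Overhead, CudaCpp MEs and CudaCpp HEL timers, and the printed ME throughput is the event count divided by the ME timer alone. A short sketch of that arithmetic, assuming the decomposition inferred from the logs (the numbers are copied from the (2-none) gg_ttxg run above):

  # Assumed [COUNTERS] arithmetic, inferred from the log lines above:
  nevents = 8192
  t_overhead = 0.3938  # Fortran Overhead ( 0 )
  t_mes = 0.3570       # CudaCpp MEs ( 2 )
  t_hel = 0.0012       # CudaCpp HEL ( 3 )
  total = t_overhead + t_mes + t_hel  # 0.7520s, the PROGRAM TOTAL above
  throughput = nevents / t_mes        # ~2.29E+04 events/s, as printed above
  print(total, throughput)
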
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:11 +DATE: 2025-09-24_09:42:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s - [COUNTERS] Fortran MEs ( 1 ) : 0.3318s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4162s + [COUNTERS] Fortran MEs ( 1 ) : 0.3444s for 8192 events => throughput is 2.38E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] Fortran MEs ( 1 ) : 0.3316s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s + [COUNTERS] Fortran MEs ( 1 ) : 0.3438s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471473940337211E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3325s for 8192 events => throughput is 2.46E+04 events/s - [COUNTERS] CudaCpp HEL 
( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.7380s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3440s for 8192 events => throughput is 2.38E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.574588530672827e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473940337211E-002) differ by less than 4E-4 (1.5125763475065668e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.535876e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432966e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.427997e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459294758378E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459242542743E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3886s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1014s for 8192 events => throughput is 8.08E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1065s for 8192 events => throughput is 7.69E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459294758378E-002) differ by less than 4E-4 (3.37893311330717e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459242542743E-002) differ by less than 4E-4 (3.385587202808793e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.182689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.892225e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.204950e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896609e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459599782634E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0503s for 8192 events => throughput is 1.63E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459599782634E-002) differ by less than 4E-4 (3.340062399992405e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.782969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654755e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.675872e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459599782634E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4301s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0484s for 8192 events => throughput is 1.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459599782634E-002) differ by less than 4E-4 (3.340062399992405e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853168e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860470e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471932611128E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471449789984E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4447s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0551s for 8192 events => throughput is 1.49E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4496s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 8192 events => throughput is 1.43E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471471932611128E-002) differ by less than 4E-4 (1.768430569759616e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471449789984E-002) differ by less than 4E-4 (1.8299587956072116e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.481854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461497e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.468460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463643e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471475012321185E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471527735093E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8445s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8332s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.14E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0074s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471475012321185E-002) differ by less than 4E-4 (1.375968260441951e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471527735093E-002) differ by less than 4E-4 (1.8200258744549558e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317074e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.890243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450306e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.313606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688503e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.232701e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.686499e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.685793e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.230438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646028e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.193713e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689493e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.247962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483427e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 2422d3068f..93f6dedb27 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
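
[Editor's note] In the CHECK/GCHECK headers of the gg_ttxg logs above, the '-p <blocks> <threads> <iterations>' arguments fix the number of events per launch as the product of the three values; this revision halves the GCHECK(MAX*) grids (e.g. from '-p 16384 32 1' to '-p 8192 32 1', i.e. from 524288 to 262144 events per launch). A tiny sketch of that bookkeeping, assuming the product rule inferred from the logs (events_per_launch is a hypothetical helper):

  # Assumed mapping from '-p blocks threads iterations' to events per launch:
  def events_per_launch(blocks, threads, iterations=1):
      return blocks * threads * iterations

  assert events_per_launch(256, 32) == 8192      # CHECK(8192) / GCHECK(8192)
  assert events_per_launch(8192, 32) == 262144   # GCHECK(MAX), new grid
  assert events_per_launch(2048, 128) == 262144  # GCHECK(MAX128THR), new grid
  assert events_per_launch(32768, 8) == 262144   # GCHECK(MAX8THR), new grid
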
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:53 +DATE: 2025-09-24_09:42:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] Fortran MEs ( 1 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.3429s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] Fortran MEs ( 1 ) : 0.3310s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s + [COUNTERS] Fortran MEs ( 1 ) : 0.3424s for 8192 events => throughput is 2.39E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486590207598E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3510s for 8192 events => throughput is 2.33E+04 events/s - [COUNTERS] CudaCpp HEL 
( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7532s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3602s for 8192 events => throughput is 2.27E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765766516956e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207598E-002) differ by less than 2E-4 (9.945766210606166e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.409349e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326368e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415956e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324676e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486540430027E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1792s for 8192 events => throughput is 4.57E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.5817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3911s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 8192 events => throughput is 4.31E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486540430027E-002) differ by less than 2E-4 (9.311426296676473e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.653483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.378332e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.691370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.388241e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614223E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4809s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0896s for 8192 events => throughput is 9.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4866s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0947s for 8192 events => throughput is 8.65E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614223E-002) differ by less than 2E-4 (8.332525780474498e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.875956e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.391101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.709061e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614223E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0796s for 8192 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4783s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3908s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0870s for 8192 events => throughput is 9.42E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614223E-002) differ by less than 2E-4 (8.332525780474498e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.543350e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.670798e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5026s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1170s for 8192 events => throughput is 7.00E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.5086s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1159s for 8192 events => throughput is 7.07E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277263846030337e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.005425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.165999e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.056979e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.185185e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485791426987E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486423885309E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8432s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s + [COUNTERS] PROGRAM TOTAL : 0.8467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8318s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0081s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485791426987E-002) differ by less than 2E-4 (2.334807902570901e-10) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486423885309E-002) differ by less than 2E-4 (7.826240988606514e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.128450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.212267e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.439893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293617e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421024e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111513e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153444e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131049e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112576e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169695e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.159084e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112106e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.638179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027373e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5517ab4292..5c8e19e910 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:26 +DATE: 2025-09-24_09:42:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.6353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] Fortran MEs ( 1 ) : 4.3260s for 8192 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8674s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3029s + [COUNTERS] Fortran MEs ( 1 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] Fortran MEs ( 1 ) : 4.2921s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8604s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2983s + [COUNTERS] Fortran MEs ( 1 ) : 4.5621s for 8192 events => throughput is 1.80E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4476s for 8192 events => throughput is 1.84E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s + [COUNTERS] PROGRAM TOTAL : 4.9836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2961s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6779s for 8192 events => throughput is 1.75E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0096s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.894558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.790589e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891638e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786472e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240180] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3665s for 8192 events => throughput is 3.46E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s + [COUNTERS] PROGRAM TOTAL : 2.8513s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5508s for 8192 events => throughput is 3.21E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240180) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252169e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.542201e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.243534e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0356s for 8192 events => throughput is 7.91E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.4460s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1455s for 8192 events => throughput is 7.15E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.118919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.371211e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.114943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.386101e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9207s for 8192 events => throughput is 8.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.3562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0584s for 8192 events => throughput is 7.74E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.276674e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.283097e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.241984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.316461e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2912s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1708s for 8192 events => throughput is 7.00E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.5549s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2975s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2543s for 8192 events => throughput is 6.53E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.083404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.637926e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.577309e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7374s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0348s + [COUNTERS] PROGRAM TOTAL : 0.8768s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0565s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.149005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.128754e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152335e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.031773e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039843e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.126645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032424e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039735e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036401e+05 ) 
sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.426547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.128114e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 78567e12c9..dc699c27b6 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:08:49 +DATE: 2025-09-24_09:44:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2940s - [COUNTERS] Fortran MEs ( 1 ) : 4.2923s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8605s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s + [COUNTERS] Fortran MEs ( 1 ) : 4.5616s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2886s - [COUNTERS] Fortran MEs ( 1 ) : 4.3038s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8552s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2967s + [COUNTERS] Fortran MEs ( 1 ) : 4.5585s for 8192 events => throughput is 1.80E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941544531159] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941829360230] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3184s for 8192 events => throughput is 1.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) 
: 0.0085s + [COUNTERS] PROGRAM TOTAL : 4.9162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6117s for 8192 events => throughput is 1.78E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0093s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941544531159) differ by less than 4E-4 (4.675947774535061e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941829360230) differ by less than 4E-4 (4.684541254906804e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821921e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823714e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144937034821881] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2933s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1966s for 8192 events => throughput is 6.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.5793s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2964s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2802s for 8192 events => throughput is 6.40E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ by less than 4E-4 (4.550249099066761e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937034821881) differ by less than 4E-4 (4.539886881094191e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.048957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.669582e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.041651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.614973e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144939883924923] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8128s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5189s for 8192 events => throughput is 1.58E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + [COUNTERS] PROGRAM TOTAL : 0.8721s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5761s for 8192 events => throughput is 1.42E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939883924923) differ by less than 4E-4 (4.62584619276285e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458780e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456486e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144939883924923] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4817s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s + [COUNTERS] PROGRAM TOTAL : 0.8324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5359s for 8192 events => throughput is 1.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939883924923) differ by less than 4E-4 (4.62584619276285e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826080e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.615038e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.802534e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.617243e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144947512238093] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5836s for 8192 events => throughput is 1.40E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2991s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6189s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ by less than 4E-4 (4.857178601991308e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947512238093) differ by less than 4E-4 (4.855997415953439e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336496e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.421428e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341570e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144955535316123] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144805623008405] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7866s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 8192 events => throughput is 3.02E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0245s + [COUNTERS] PROGRAM TOTAL : 0.8568s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7429s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0579s for 8192 events => throughput is 1.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0560s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144955535316123) differ by less than 4E-4 (5.0980589545446264e-06) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144805623008405) differ by less than 4E-4 (5.751060780934125e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.089397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427342e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.388762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450556e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.126017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.235203e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.254976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.256194e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.087410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240003e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.221892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.276462e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.084262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251882e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.392382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.720302e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 0f7d6f4131..80beb65efd 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:07:37 +DATE: 2025-09-24_09:43:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2922s - [COUNTERS] Fortran MEs ( 1 ) : 4.3067s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8647s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3010s + [COUNTERS] Fortran MEs ( 1 ) : 4.5637s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6012s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] Fortran MEs ( 1 ) : 4.3113s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8662s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s + [COUNTERS] Fortran MEs ( 1 ) : 4.5676s for 8192 events => throughput is 1.79E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5027s for 8192 events => throughput is 1.82E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s + [COUNTERS] PROGRAM TOTAL : 5.0375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2977s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7301s for 8192 events => throughput is 1.73E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0097s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ OK! 
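Editorial note: the [COUNTERS] throughput figures above are self-consistent — dividing the event count by the reported ME time reproduces the quoted rate. A quick check in Python, using the Fortran counters from the new log above (sketch only):

    # 8192 events in 4.5637 s of Fortran MEs => ~1.80E+03 events/s, as logged
    events = 8192
    me_seconds = 4.5637
    print(f"throughput = {events / me_seconds:.2E} events/s")  # 1.80E+03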
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.881337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.768670e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.867505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.766915e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3862s for 8192 events => throughput is 3.43E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.7375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2980s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4343s for 8192 events => throughput is 3.37E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.548157e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422239e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.537868e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.438997e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0325s for 8192 events => throughput is 7.93E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s + [COUNTERS] PROGRAM TOTAL : 1.4216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2955s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1235s for 8192 events => throughput is 7.29E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.171504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.443598e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.183239e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.406915e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1999s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9051s for 8192 events => throughput is 9.05E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2981s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0006s for 8192 events => throughput is 8.19E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
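Editorial note: the cross-section checks quote a relative difference in parentheses; it is consistent with |xsec_cpp - xsec_fortran| / xsec_fortran, as the avx2/512y values above show. A minimal sketch (the actual comparison is performed by the tmad test scripts, which may compute it slightly differently):

    # values copied from the avx2/512y comparison above
    xsec_fortran = 0.33144786561240197
    xsec_cpp     = 0.33144786627894512
    rel = abs(xsec_cpp - xsec_fortran) / xsec_fortran
    print(rel)         # ~2.0110e-09, matching the logged figure
    assert rel < 2e-4  # the tolerance quoted in the log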
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.165581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.412011e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.350878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.381821e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2928s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1793s for 8192 events => throughput is 6.95E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.5416s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2413s for 8192 events => throughput is 6.60E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.035517e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701255e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843003e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.645821e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786533876569] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786662983072] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7401s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0389s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0347s + [COUNTERS] PROGRAM TOTAL : 0.8743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7450s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0566s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786533876569) differ by less than 2E-4 (8.255786054789382e-10) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786662983072) differ by less than 2E-4 (3.0696494235371574e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.127471e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156691e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127674e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034512e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039347e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033378e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.173815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036749e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037400e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.416494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126745e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 74862dd5f7..0e9d9d2f29 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - -make USEBUILDDIR=1 BACKEND=cuda +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
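Editorial note: in the GCHECK command lines above, `-p <blocks> <threads> <iterations>` appears to fix the GPU grid, so `-p 256 32 1` corresponds to 256 x 32 = 8192 events per iteration. The MAX-style configurations were scaled down in the new logs (e.g. from `-p 16384 32 1`, 524288 events, to `-p 512 32 1`, 16384 events), so the MAX/MAX128THR/MAX8THR throughputs are not directly comparable between the two runs. A sketch of the arithmetic (grid semantics inferred from the 8192-event runs elsewhere in the logs, not confirmed here):

    def events_per_iteration(blocks, threads):
        # one event per GPU thread per iteration (inferred convention)
        return blocks * threads

    print(events_per_iteration(256, 32))    # 8192   (the standard GCHECK grid)
    print(events_per_iteration(16384, 32))  # 524288 (old MAX grid)
    print(events_per_iteration(512, 32))    # 16384  (new MAX grid)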
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:25 +DATE: 2025-09-24_09:45:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.9475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5365s - [COUNTERS] Fortran MEs ( 1 ) : 100.4109s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s + [COUNTERS] Fortran MEs ( 1 ) : 101.9126s for 8192 events => throughput is 8.04E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5296s - [COUNTERS] Fortran MEs ( 1 ) : 100.2810s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4309s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5346s + [COUNTERS] Fortran MEs ( 1 ) : 101.8963s for 8192 events => throughput is 8.04E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 127.1376s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.4018s for 8192 events => throughput is 6.48E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2074s + [COUNTERS] PROGRAM TOTAL : 163.4088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s + [COUNTERS] CudaCpp MEs ( 2 ) : 162.5706s for 8192 events => throughput is 5.04E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3035s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.678586e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.226735e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694101e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.259225e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 61.7097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.0765s for 8192 events => throughput is 1.34E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1001s + [COUNTERS] PROGRAM TOTAL : 86.5586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5341s + [COUNTERS] CudaCpp MEs ( 2 ) : 85.8620s for 8192 events => throughput is 9.54E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1625s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.591189e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949901e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580161e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.747834e+01 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 29.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5263s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.7837s for 8192 events => throughput is 2.85E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s + [COUNTERS] PROGRAM TOTAL : 40.6226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 40.0118s for 8192 events => throughput is 2.05E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0767s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.407090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131860e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.415212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.159412e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.2469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6788s for 8192 events => throughput is 3.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0410s + [COUNTERS] PROGRAM TOTAL : 37.6969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5334s + [COUNTERS] CudaCpp MEs ( 2 ) : 37.0922s for 8192 events => throughput is 2.21E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0712s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.913687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294972e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.895964e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345904e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5871s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0481s + [COUNTERS] PROGRAM TOTAL : 34.2348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s + [COUNTERS] CudaCpp MEs ( 2 ) : 33.6335s for 8192 events => throughput is 2.44E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0658s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.408791e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.512973e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.444614e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511766e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 3.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1215s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1040s for 8192 events => throughput is 7.42E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 1.0875s + [COUNTERS] PROGRAM TOTAL : 4.4192s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2237s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8337s for 8192 events => throughput is 4.47E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.3618s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
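Editorial note: in the double-precision ggttggg run above, the Fortran and CUDA cross sections now agree to a relative difference of 2.220446049250313e-16, which is exactly the IEEE-754 double-precision machine epsilon (2^-52), i.e. agreement to the last representable digit; the 3E-14 tolerance is met comfortably. This can be verified directly:

    import sys
    # 2^-52 is the spacing between 1.0 and the next representable double
    print(sys.float_info.epsilon)  # 2.220446049250313e-16
    print(2.0**-52)                # same value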
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.491511e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.501671e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.275455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.526830e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504711e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.552042e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533077e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.301465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.507550e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.448921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.566191e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.252906e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504247e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.241973e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.878509e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bfa4b4cda4..78f71e7886 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:42:40 +DATE: 2025-09-24_10:29:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8152s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5282s - [COUNTERS] Fortran MEs ( 1 ) : 100.2871s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.7811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s + [COUNTERS] Fortran MEs ( 1 ) : 102.2455s for 8192 events => throughput is 8.01E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.7247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5322s - [COUNTERS] Fortran MEs ( 1 ) : 100.1925s for 8192 events => throughput is 8.18E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.6282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5399s + [COUNTERS] Fortran MEs ( 1 ) : 102.0883s for 8192 events => throughput is 8.02E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -108,30 +114,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849446922190E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849656360290E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 112.7914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5240s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.0829s for 8192 events => throughput is 7.31E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1845s + [COUNTERS] PROGRAM TOTAL : 149.9291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 149.1165s for 8192 events => throughput is 5.49E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2786s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849446922190E-007) differ by less than 4E-4 (0.00013947977747852391) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849656360290E-007) differ by less than 4E-4 (0.00013948866230428791) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631916e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705349e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.625132e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.750941e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -146,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -154,30 +159,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845268372665E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.7980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2235s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0474s + [COUNTERS] PROGRAM TOTAL : 39.0745s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5342s + [COUNTERS] CudaCpp MEs ( 2 ) : 38.4658s for 8192 events => throughput is 2.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0745s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007) differ by less than 4E-4 (0.0001392986940575991) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845268372665E-007) differ by less than 4E-4 (0.0001393025142009119) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386203e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225201e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.374145e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.224404e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -192,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -200,30 +204,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845201396375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 14.8120s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5245s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.2638s for 8192 events => throughput is 5.74E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0236s + [COUNTERS] PROGRAM TOTAL : 20.3883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5317s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.8182s for 8192 events => throughput is 4.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0383s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845201396375E-007) differ by less than 4E-4 (0.00013929967291903544) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.872770e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.313431e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.864576e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.334992e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -238,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -246,30 +249,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845201396375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5262s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7618s for 8192 events => throughput is 6.42E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0211s + [COUNTERS] PROGRAM TOTAL : 18.8938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5335s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.3248s for 8192 events => throughput is 4.47E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0355s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845201396375E-007) differ by less than 4E-4 (0.00013929967291903544) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.728743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.590348e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.768099e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.692981e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -292,30 +294,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575850881931771E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.2286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6780s for 8192 events => throughput is 6.46E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0225s + [COUNTERS] PROGRAM TOTAL : 17.3656s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 16.7981s for 8192 events => throughput is 4.88E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0336s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007) differ by less than 4E-4 (0.00013953971621538663) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850881931771E-007) differ by less than 4E-4 (0.0001395406537467725) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.948019e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.100289e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.969717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.096706e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -337,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575862304433055E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572568179359759E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.2079s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5470s for 8192 events => throughput is 1.50E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.5524s + [COUNTERS] PROGRAM TOTAL : 3.8207s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2702s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1821s for 8192 events => throughput is 6.93E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.3684s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3575862304433055E-007) differ by less than 4E-4 (0.00014002522141920437) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568179359759E-007) differ by less than 4E-4 (2.8117764494517417e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.517499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.967387e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.545233e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769635e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.948508e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.181453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.760546e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.126165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.948816e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.164632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904736e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163932e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.951778e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.073078e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.835102e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3a68950921..ccd6dbacb7 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda @@ -9,33 +12,37 @@ make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:26:37 +DATE: 2025-09-24_10:07:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 101.1381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5302s - [COUNTERS] Fortran MEs ( 1 ) : 100.6080s for 8192 events => throughput is 8.14E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4899s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5342s + [COUNTERS] Fortran MEs ( 1 ) : 101.9557s for 8192 events => throughput is 8.03E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s - [COUNTERS] Fortran MEs ( 1 ) : 100.3451s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.5084s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5366s + [COUNTERS] Fortran MEs ( 1 ) : 101.9718s for 8192 events => throughput is 8.03E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 123.7239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.9787s for 8192 events => throughput is 
6.66E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2095s + [COUNTERS] PROGRAM TOTAL : 164.8069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 163.9850s for 8192 events => throughput is 5.00E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2889s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634632e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.294826e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.608909e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.254752e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.5975s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5274s - [COUNTERS] CudaCpp MEs ( 2 ) : 63.9661s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1041s + [COUNTERS] PROGRAM TOTAL : 85.1648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s + [COUNTERS] CudaCpp MEs ( 2 ) : 84.4703s for 8192 events => throughput is 9.70E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1598s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.549992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034210e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544779e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028899e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.6856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5254s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1150s for 8192 events => throughput is 2.91E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0453s + [COUNTERS] PROGRAM TOTAL : 39.0824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s + [COUNTERS] CudaCpp MEs ( 2 ) : 38.4775s for 8192 events => throughput is 2.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0729s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581303e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206988e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574698e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.236323e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 24.6205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5315s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0503s for 8192 events => throughput is 3.41E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0387s + [COUNTERS] PROGRAM TOTAL : 36.0617s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s + [COUNTERS] CudaCpp MEs ( 2 ) : 35.4622s for 8192 events => throughput is 2.31E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0666s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161373e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.428075e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.184852e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.418588e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 25.7441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1699s for 8192 events => throughput is 3.25E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0462s + [COUNTERS] PROGRAM TOTAL : 33.2889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5333s + [COUNTERS] CudaCpp MEs ( 2 ) : 32.6936s for 8192 events => throughput is 2.51E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0621s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.516660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.614731e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.515216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.619097e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561518129465E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561698474940E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0822s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8795s for 8192 events => throughput is 9.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.8844s + [COUNTERS] PROGRAM TOTAL : 4.1792s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2194s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6813s for 8192 events => throughput is 4.87E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.2785s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561518129465E-007) differ by less than 2E-4 (1.4064212017217415e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561698474940E-007) differ by less than 2E-4 (6.244231132157552e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.415473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.916582e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080771e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.946512e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106752e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.914249e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.156598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.944163e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106849e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.911624e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.995050e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111142e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.914631e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331554e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7310cfc72a..94f8459e48 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:42 +DATE: 2025-09-24_09:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5319s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s - [COUNTERS] Fortran MEs ( 1 ) : 0.0722s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4659s + [COUNTERS] Fortran MEs ( 1 ) : 0.0734s for 8192 events => throughput is 1.12E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4047s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4175s + [COUNTERS] Fortran MEs ( 1 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0802s for 8192 events => throughput is 1.02E+05 events/s [COUNTERS] 
CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028696e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.079140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033450e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.895347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786731e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.801695e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4395s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 8192 events => throughput is 3.12E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148665e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.307491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.230434e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4358s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 8192 events => throughput is 3.26E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.280342e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447820e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4438s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0364s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.386493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325632e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342134e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8451s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8555s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8481s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.19E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) diffe OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.866764e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.652053e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.327919e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.284283e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.161258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.180927e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.268671e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.319766e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.275096e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.288694e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.760684e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 748c92b28c..41a9d43802 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,7 +1,10 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -9,33 +12,37 @@ make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:11 +DATE: 2025-09-24_09:45:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4630s + [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4156s + [COUNTERS] Fortran MEs ( 1 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313506139857326] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0723s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4913s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ by less than 4E-4 (8.014351782215101e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506139857326) differ by less than 4E-4 (8.044501620396716e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064305e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070087e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313502999900010] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4383s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ by less than 4E-4 (7.423917058879681e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502999900010) differ by less than 4E-4 (7.412985369992242e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.016574e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.827821e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049161e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.873784e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313502617200768] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4241s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.03E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502617200768) differ by less than 4E-4 (9.296949998738313e-08)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.215183e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.180969e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.201945e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.191931e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313502617200768] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4177s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4051s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.69E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4243s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4110s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.31E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502617200768) differ by less than 4E-4 (9.296949998738313e-08)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.557168e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.453658e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.659565e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.428297e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313505319061453] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4267s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ by less than 4E-4 (3.910739154733278e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505319061453) differ by less than 4E-4 (4.003860221146738e-08)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.736521e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.828072e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.799657e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.874410e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313508590887899] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313508403515360] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.8496s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8457s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s
+ [COUNTERS] PROGRAM TOTAL : 0.8572s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8508s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 7.00E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508590887899) differ by less than 4E-4 (2.011051698502797e-07)
+OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508403515360) differ by less than 4E-4 (1.9188113165036214e-07)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049327e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.377483e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.339018e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.198635e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.110522e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.020211e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.423874e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.227757e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.090502e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.018085e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.757351e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.194185e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.720065e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.014234e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.206204e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.542247e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index dd13a39319..5c1471e368 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
-make USEBUILDDIR=1 BACKEND=cuda
-make USEBUILDDIR=1 BACKEND=cppnone
+make USEBUILDDIR=1 BACKEND=cuda
+make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:09:56
+DATE: 2025-09-24_09:45:28
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0
 [UNWEIGHT] Wrote 506 events (found 1943 events)
- [COUNTERS] PROGRAM TOTAL : 0.5254s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0717s for 8192 events => throughput is 1.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5379s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4648s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4842s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4128s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0714s for 8192 events => throughput is 1.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4903s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4168s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4899s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
+ [COUNTERS] PROGRAM TOTAL : 0.4927s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0804s for 8192 events => throughput is 1.02E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073352e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.028885e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073996e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.032709e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504495344833] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4513s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
+ [COUNTERS] PROGRAM TOTAL : 0.4656s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0479s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344833) differ by less than 2E-4 (5.115952106393706e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.886911e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.818000e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.898728e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.829372e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4424s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.22E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4388s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.21E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.243245e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.181258e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.311888e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.266270e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4308s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4377s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0240s for 8192 events => throughput is 3.42E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.793279e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.489599e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.775522e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.437681e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4486s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
+ [COUNTERS] PROGRAM TOTAL : 0.4454s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 8192 events => throughput is 2.35E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.316706e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.358017e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.334216e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.421035e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504512110778] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.8511s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8469s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s
+ [COUNTERS] PROGRAM TOTAL : 0.8540s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8466s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.47E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504512110778) diffe
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.929266e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.782717e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.319589e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.427822e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.340652e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.290753e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.169068e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.175981e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.326566e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.286909e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.337296e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.267053e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.337938e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.287822e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.656612e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.756338e+06 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
index d2a669114e..9ff238b7e1 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 make USEBUILDDIR=1 BACKEND=cuda
-
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:54:40
+DATE: 2025-09-24_10:44:59
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9766s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9291s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.0945s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0452s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0493s for 8192 events => throughput is 1.66E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4581s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0476s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5080s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4592s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0501s for 8192 events => throughput is 1.63E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.5057s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4528s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0524s for 8192 events => throughput is 1.56E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -123,14 +130,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.648377e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.608463e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.642355e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.611218e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4344s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4839s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -168,14 +175,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.984151e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.771322e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017550e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.772334e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4261s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4726s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0183s for 8192 events => throughput is 4.46E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -213,14 +220,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.938014e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.577287e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.942444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.549926e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4299s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4733s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4559s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.83E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,14 +265,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.398535e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.674690e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.466636e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.906431e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4391s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4803s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.23E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.480162e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296909e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.526547e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.379324e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081479755192] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.8532s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8493s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.8974s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8915s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.87E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755192) differ by less than 3E-14 (4.440892098500626e-16)
+OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.920216e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.947513e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.457557e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.552068e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.816989e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.263798e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.149758e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.488706e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.802618e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.278324e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.511448e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.756938e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.832166e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.256429e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.514724e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.399736e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
index 483bc4166c..bfe8c62394 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
-make USEBUILDDIR=1 BACKEND=cuda
+make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:55:09
+DATE: 2025-09-24_10:45:32
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9638s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9156s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.0965s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0472s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4563s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4977s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4485s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0492s for 8192 events => throughput is 1.67E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -107,26 +114,27 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160406825242951] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1
 [UNWEIGHT] Wrote 1653 events (found 1658 events)
- [COUNTERS] PROGRAM TOTAL : 0.4552s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
+ [COUNTERS] PROGRAM TOTAL : 0.5040s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0498s for 8192 events => throughput is 1.64E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406825242951) differ by less than 4E-4 (1.6138103811513815e-05)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ!
-diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20
-7562,7575d7561
-< 4 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00
+diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20
+8102,8116d8101
+< 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00
 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1.
 < 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1.
-< 5 1 1 2 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1.
-< -5 1 1 2 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1.
+< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0.
+< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1.
+< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1.
 <
 < 0 0.12500099E+03
 < 0
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index b61563e796..ddcc5a005b 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
-
 make USEBUILDDIR=1 BACKEND=cppsse4
+
 make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:54:54
+DATE: 2025-09-24_10:45:15
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9594s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9118s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1007s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0512s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0495s for 8192 events => throughput is 1.65E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4589s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4978s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4481s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -108,33 +114,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081964453331] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4600s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4089s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0507s for 8192 events => throughput is 1.62E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.5084s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4555s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0525s for 8192 events => throughput is 1.56E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453331) differ by less than 2E-4 (2.4042469792817656e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.539881e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.508236e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.532971e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.510960e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -149,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -157,33 +159,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081964453336] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4363s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0279s for 8192 events => throughput is 2.94E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4822s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4518s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453336) differ by less than 2E-4 (2.404247001486226e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.824636e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.648349e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.869373e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.666977e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -198,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -206,33 +204,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4311s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.4719s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4531s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0183s for 8192 events => throughput is 4.48E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.809707e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.449043e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.724204e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.654852e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -247,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -255,33 +249,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4252s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4690s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4517s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.163712e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.928374e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.204514e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.901177e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -296,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -304,33 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962970020] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4306s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4058s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4788s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4530s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.23E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962970020) differ by less than 2E-4 (2.3968893092529697e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.121651e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.246164e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.119023e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.288651e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -352,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081483021330] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081946290331] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.8574s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8536s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.8989s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8930s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.90E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081483021330) differ by less than 2E-4 (1.6201062713605552e-10)
+OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081946290331) differ by less than 2E-4 (2.31415309137617e-08)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018963e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.891035e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.363694e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.983431e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.820757e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.267970e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.067644e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.492702e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.797704e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.273530e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.465309e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.770175e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.821262e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.254943e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.503862e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.399759e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index d3cb91b8cd..885b8f6235 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -1,4 +1,7 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 make USEBUILDDIR=1 BACKEND=cuda
@@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
+
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:56:37
+DATE: 2025-09-24_10:47:04
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 902 events)
- [COUNTERS] PROGRAM TOTAL : 2.6766s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3643s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3123s for 8192 events => throughput is 3.54E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7247s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3637s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3610s for 8192 events => throughput is 3.47E+03 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.6640s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3007s for 8192 events => throughput is 3.56E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7259s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3537s for 8192 events => throughput is 3.48E+03 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.8505s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.4822s for 8192 events => throughput is 3.30E+03 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s
+ [COUNTERS] PROGRAM TOTAL : 2.8837s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3696s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.5087s for 8192 events => throughput is 3.27E+03 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.457369e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.344967e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.441555e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.331486e+03 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.6655s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3645s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.2984s for 8192 events => throughput is 6.31E+03 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s
+ [COUNTERS] PROGRAM TOTAL : 1.7868s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.4123s for 8192 events => throughput is 5.80E+03 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.514132e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.905219e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.544925e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.868019e+03 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.9435s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5751s for 8192 events => throughput is 1.42E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s
+ [COUNTERS] PROGRAM TOTAL : 1.0028s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3733s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.6279s for 8192 events => throughput is 1.30E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.460459e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.355383e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.466853e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.352140e+04 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8804s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3647s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5144s for 8192 events => throughput is 1.59E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s
+ [COUNTERS] PROGRAM TOTAL : 0.9343s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5633s for 8192 events => throughput is 1.45E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641494e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.516928e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.655223e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.520810e+04 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.0440s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3665s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6757s for 8192 events => throughput is 1.21E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s
+ [COUNTERS] PROGRAM TOTAL : 1.0825s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7080s for 8192 events => throughput is 1.16E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.221115e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.175579e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.225553e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.172330e+04 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8457s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.13E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s
+ [COUNTERS] PROGRAM TOTAL : 0.8868s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8154s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0385s for 8192 events => throughput is 2.13E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0329s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.230611e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.122719e+05 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.541816e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.161966e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.854537e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.749173e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX) -p 4096 32 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.229320e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.719568e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.859903e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.749862e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.225591e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.734145e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.850975e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.747981e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.687847e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.787425e+05 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
index 10c15cf9d1..9028aeb504 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
-
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
+
+
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
-
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:58:07
+DATE: 2025-09-24_10:48:27
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 902 events)
- [COUNTERS] PROGRAM TOTAL : 2.6755s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3152s for 8192 events => throughput is 3.54E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7248s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3655s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3593s for 8192 events => throughput is 3.47E+03 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.6754s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3698s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3056s for 8192 events => throughput is 3.55E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7319s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3598s for 8192 events => throughput is 3.47E+03 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381686438954397E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381686572538756E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.8067s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.4358s for 8192 events => throughput is 3.36E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.8065s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.4308s for 8192 events => throughput is 3.37E+03 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686438954397E-007) differ by less than 4E-4 (9.960018576560259e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686572538756E-007) differ by less than 4E-4 (9.977507651193207e-07)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.485505e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.422260e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.473644e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.438824e+03 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381671512533574E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.0546s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3688s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6842s for 8192 events => throughput is 1.20E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s
+ [COUNTERS] PROGRAM TOTAL : 1.0936s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7227s for 8192 events => throughput is 1.13E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994753481512e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671512533574E-007) differ by less than 4E-4 (8.005828195933873e-07)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.232148e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.164944e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.242719e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.167862e+04 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381672199194947E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.6626s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3670s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 8192 events => throughput is 2.78E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s
+ [COUNTERS] PROGRAM TOTAL : 0.6882s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3158s for 8192 events => throughput is 2.59E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672199194947E-007) differ by less than 4E-4 (8.095726977686013e-07)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.866680e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.676459e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.814611e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.691626e+04 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381672199194947E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.6345s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3672s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2664s for 8192 events => throughput is 3.07E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.6551s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2840s for 8192 events => throughput is 2.88E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672199194947E-007) differ by less than 4E-4 (8.095726977686013e-07)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.183014e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.977246e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.199503e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981214e+04 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381686626552808E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.7045s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
+ [COUNTERS] PROGRAM TOTAL : 0.7233s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3509s for 8192 events => throughput is 2.33E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572607611946e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686626552808E-007) differ by less than 4E-4 (9.98457925449614e-07)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.460974e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.378670e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.436294e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.375697e+04 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381711031958629E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381615658692040E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8419s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8049s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0197s for 8192 events => throughput is 4.15E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0172s
+ [COUNTERS] PROGRAM TOTAL : 0.8824s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8173s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 8192 events => throughput is 2.44E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0314s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381711031958629E-007) differ by less than 4E-4 (1.3179773188376487e-06)
+OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615658692040E-007) differ by less than 4E-4 (6.933558260868722e-08)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK!
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.233915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.459531e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.454452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481553e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.300238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.290559e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192752e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.294935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309301e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.053552e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 4.272688e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.911013e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 9cff3d3d2c..d798df1f36 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:57:22 +DATE: 2025-09-24_10:47:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] Fortran MEs ( 1 ) : 2.3072s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s + [COUNTERS] Fortran MEs ( 1 ) : 2.3616s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3632s - [COUNTERS] Fortran MEs ( 1 ) : 2.3031s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] Fortran MEs ( 1 ) : 2.3582s for 8192 events => throughput is 3.47E+03 
events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3651s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5054s for 8192 events => throughput is 3.27E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s + [COUNTERS] PROGRAM TOTAL : 2.8976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5208s for 8192 events => throughput is 3.25E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0054s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293319738268e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427512e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319062e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.426484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319549e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2706s for 8192 events => throughput is 6.45E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.7267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3516s for 8192 events => throughput is 6.06E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164241365944e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.733385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.130825e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.780255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.109980e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5747s for 8192 events => throughput is 1.43E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6185s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.446717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.371139e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473262e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357048e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5015s for 8192 events => throughput is 1.63E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + [COUNTERS] PROGRAM TOTAL : 0.9295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5569s for 8192 events => throughput is 1.47E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.681650e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528728e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.668117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.538029e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6857s for 8192 events => throughput is 1.19E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7094s for 8192 events => throughput is 1.15E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232369e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181329e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.216790e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179267e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610372590318E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608831823612E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8195s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0383s for 8192 events => throughput is 2.14E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0329s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610372590318E-007) differ by less than 2E-4 (1.2911138824733825e-10) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608831823612E-007) differ by less than 2E-4 (2.0042847492796056e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100571e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.527801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161024e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.836972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.747749e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.176072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.719794e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.835271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.751159e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.206917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733503e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.823749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.750316e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.671807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.790163e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index f18eaf3551..d3447e0b2c 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:56 +DATE: 2025-09-24_10:46:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6838s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7006s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4289s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4203s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) 
- [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4279s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.05E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.916439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.517606e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.017065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.556272e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.79E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4354s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.913729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736519e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804687e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.118646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.871535e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.044911e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.07E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4292s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4257s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.142389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039759e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.418661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248707e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s + [COUNTERS] PROGRAM TOTAL : 0.4281s [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.67E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.27E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.123505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787533e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426109] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.02E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426109) diffe OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.094441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327611e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.576690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877091e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.540792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655959e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885377e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354113e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486109e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.658621e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391364e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.512059e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.658194e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224875e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.400250e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 9cee2ab297..c88c021ccd 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:23 +DATE: 2025-09-24_10:46:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6879s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6890s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4213s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4271s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.16E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.006620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.685035e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.581417e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4301s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.779402e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.369793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773921e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446595743795] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4257s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.50E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446595743795) differ by less than 4E-4 (1.8876143514923172e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.872977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.742333e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.148892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686508e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446595743795] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.44E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.56E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446595743795) differ by less than 4E-4 (1.8876143514923172e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.886846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.187003e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.506416e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.926848e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446997188218] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4202s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4281s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.09E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ by less than 4E-4 (1.744457354124762e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446997188218) differ by less than 4E-4 (1.7557747322705097e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.376595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.995899e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.863933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.536276e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447352014630] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447100896687] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447352014630) differ by less than 4E-4 (1.639245078566276e-07) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447100896687) differ by less than 4E-4 (1.7217155090509806e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.209039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.950959e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.497762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932846e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.599688e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122837e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103544e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.713910e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.606706e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.139014e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131283e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.219680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.999475e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664371e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855116e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 782fee34a5..fb8e1884d6 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:09 +DATE: 2025-09-24_10:46:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,8 +65,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6866s + [COUNTERS] PROGRAM TOTAL : 0.6987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6901s [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4398s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4292s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.98E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.831908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.422189e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.918457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535671e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.83E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4334s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.49E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.892977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767119e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.974211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859923e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4260s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984842e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.477152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183434e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4270s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.997535e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507028e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294958e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4289s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4249s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.926715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666307e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.198931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789154e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452360186230] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453200208287] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8609s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8564s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.05E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452360186230) differ by less than 2E-4 (5.504239286580059e-10) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453200208287) differ by less than 2E-4 (2.813785138222613e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339486e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.858130e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.506637e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655045e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.900315e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.353515e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486873e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.662294e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390355e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.466467e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.659752e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235205e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.405982e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index bebebe43ae..1a5ebcb708 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:13 +DATE: 2025-09-24_10:45:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8571s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] Fortran MEs ( 1 ) : 0.0436s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4584s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4712s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846964) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.859940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795171e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795530e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4493s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4216s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 8192 events => throughput is 3.00E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846957) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.063303e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.061071e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4324s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4227s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0166s for 8192 events => throughput is 4.94E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.021968e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.310117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062640e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4350s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.704117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.201601e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.793092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.417673e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4434s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4195s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.35E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432574e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.453192e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8664s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8608s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.36E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0043s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846943) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914793e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001985e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.910414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.143094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037284e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.909403e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.020534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073169e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.916674e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623185e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 2a76a737ac..8da465d5f4 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:42 +DATE: 2025-09-24_10:46:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7971s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8149s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4141s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906076692848] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4624s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4686s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.4240s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627507661078e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906076692848) differ by less than 4E-4 (1.2587171749345316e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.972969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876039e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.987350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883693e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902218109820] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.31E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735186305758e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902218109820) differ by less than 4E-4 (2.123058078229434e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.748983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348521e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.695429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.342860e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902360162746] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.83E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0089s for 8192 events => throughput is 9.20E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360162746) differ by less than 4E-4 (2.0912375486847878e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.169652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.500870e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.292903e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902360162746] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360162746) differ by less than 4E-4 (2.0912375486847878e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.427731e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.637478e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906412232359] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4293s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863351012664225e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906412232359) differ by less than 4E-4 (1.1835547319982709e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.774461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400687e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.434626e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641910992291372] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641910985805701] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8579s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 8.09E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641910992291372) differ by less than 4E-4 (1.575997887748315e-08) +OK! xsec from fortran (44.641911695846943) and cuda (44.641910985805701) differ by less than 4E-4 (1.5905260664084153e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390475e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833171e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.883185e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.588623e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.341479e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.095229e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.843740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590889e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.360831e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.608054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.535879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379682e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 449e459bdc..de2afdefc1 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:27 +DATE: 2025-09-24_10:45:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8046s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8158s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4714s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0471s for 8192 events => throughput is 1.74E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.833802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775167e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.834236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.783124e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4504s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4238s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 8192 events => throughput is 3.13E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.367073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.106214e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.103998e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4302s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4398s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.283261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.205244e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.353744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.190124e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4151s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4231s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.39E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.825518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.454547e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.928231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.572443e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912970378172] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4383s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4487s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4246s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378172) differ by less than 2E-4 (2.8550104058666648e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539675e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512456e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911674225568] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912968724782] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.8642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8585s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.40E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911674225568) differ by less than 2E-4 (4.843293543999039e-10) +OK! xsec from fortran (44.641911695846943) and cuda (44.641912968724782) differ by less than 2E-4 (2.8513067462654362e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.907482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967399e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.361691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032617e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.932611e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.567905e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865156e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.923206e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.911973e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073184e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.923583e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.733673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621614e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 56fbce5d92..cc0039f545 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -251,11 +251,11 @@ function getgridmax() elif [ "${ggtt}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${ggttg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 8192 32 # same total grid dimension as 1024 256 (new sep2025) elif [ "${ggttgg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 512 32 # same total grid dimension as 64 256 (new sep2025: even 1024/32 aborts in max8thr mode) elif [ "${ggttggg}" == "1" ]; then - echo 512 32 # same total grid dimension as 64 256 + echo 256 32 # same total grid dimension as 32 256 (new sep2025) elif [ "${gguu}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${gqttq}" == "1" ]; then @@ -267,7 +267,7 @@ function getgridmax() elif [ "${susyggt1t1}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${smeftggtttt}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 4096 32 # same total grid dimension as 512 256 (new sep2025) else echo "ERROR! Unknown process" > /dev/stderr; usage fi @@ -478,9 +478,15 @@ function runmadevent() # PART 1 - build madevent ########################################################################## +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + unset GTEST_ROOT unset LOCALGTEST +export HASBLAS=hasBlas +echo HASBLAS=${HASBLAS} + for suff in $suffs; do dir=$(showdir) @@ -511,6 +517,12 @@ if [ "${maketype}" == "-makeonly" ]; then printf "\nMAKE COMPLETED\n"; exit 0; f # PART 2 - run madevent ########################################################################## +unset CUDACPP_RUNTIME_BLASCOLORSUM +printf "\nCUDACPP_RUNTIME_BLASCOLORSUM=$CUDACPP_RUNTIME_BLASCOLORSUM\n" + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +printf "\nCUDACPP_RUNTIME_CUBLASTF32TENSOR=$CUDACPP_RUNTIME_CUBLASTF32TENSOR\n" + printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" diff --git a/epochX/cudacpp/tmad/strip10x.sh b/epochX/cudacpp/tmad/strip10x.sh new file mode 100755 index 0000000000..571d134a64 --- /dev/null +++ b/epochX/cudacpp/tmad/strip10x.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +cd $(dirname $0) +for log in logs_*/log*.txt ; do + cat $log | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}' > ${log}.new + mv ${log}.new ${log} +done diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index 69ef153764..28d282be16 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
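The strip10x.sh script above is driven by a one-line awk state machine: each log section opens with a '*** ... ***' banner whose fifth whitespace-separated field is the x1/x10 tag (compare '*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***' in the logs above), and lines are printed only while the most recent banner was not an x10 one. A minimal standalone sketch of the same idiom on synthetic input (illustration only, not part of the repository scripts):

printf '%s\n' \
  '*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***' \
  'kept: body of the x1 section' \
  '*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***' \
  'dropped: body of the x10 section' \
  | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}'
# prints only the x1 banner and its body line; the x10 banner and its body are stripped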
scrdir=$(cd $(dirname $0); pwd) @@ -20,7 +20,7 @@ if [ "$(hostname)" == "itgold91.cern.ch" ]; then bblds=-cpponly; fi # Usage function usage() { - echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm]" + echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm|-scalingonly|-blasonly|-blasandscalingonly]" echo "Run tests and check all logs" echo "" echo "Usage (2): $0 -checkonly" @@ -32,7 +32,10 @@ function usage() checkonly=0 ggttggg=-ggttggg rndhst=-curhst -bsm= +sm=1 +bsm=1 +scaling=1 +blas=1 if [ "$1" == "-checkonly" ]; then # Check existing logs without running any tests? checkonly=1 @@ -73,11 +76,35 @@ while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "${bblds}" != "" ] && [ "${bblds}" != "$1" ]; then echo "ERROR! Incompatible option $1: backend builds are already defined as '$bblds'"; usage; fi bblds="$1" shift - elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then - bsm=$1 + elif [ "$1" == "-bsmonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=1 + scaling=0 + blas=0 shift - elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then - bsm=$1 + elif [ "$1" == "-nobsm" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=1 + bsm=0 + scaling=1 + blas=1 + shift + elif [ "$1" == "-scalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=0 + shift + elif [ "$1" == "-blasonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=0 + blas=1 + shift + elif [ "$1" == "-blasandscalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=1 shift else usage @@ -88,11 +115,28 @@ done function checklogs() { cd $scrdir/.. - # Print out any errors in the logs - if ! egrep -i '(error|fault|failed)' ./tput/logs_* -r; then echo "No errors found in logs"; fi + # Print out any errors in the logs (exclude scaling logs) + if ! egrep -i '(error|fault|failed)' ./tput/logs_*/*.txt; then echo "No errors found in logs"; fi # Print out any FPEs or '{ }' in the logs echo if ! egrep '(^Floating Point Exception|{ })' tput/logs* -r; then echo "No FPEs or '{ }' found in logs"; fi + # Print out any aborts in the logs (exclude scaling logs) + echo + txt=$(grep Abort ./tput/logs_*/*.txt | sed "s|\:.*SubProcesses/P|: P|") + if [ "${txt}" == "" ]; then + echo "No aborts found in logs" + else + echo "${txt}" + fi + # Print out any asserts/aborts in scaling logs + echo + txt=$(egrep -i '(abort|assert)' ./tput/logs_*/*.scaling | sed "s|\:.*SubProcesses/P|: P|" | sort -u) + if [ "${txt}" == "" ]; then + echo "No aborts or asserts found in scaling logs" + else + echo "${txt}" + fi + # Print out the MEK channelid debugging output (except for '{ }') echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | grep -v '{ }' | sed 's|_mad.*DEBUG:||' | sort -u @@ -123,11 +167,11 @@ fi cd $scrdir/.. started="STARTED AT $(date)" -# (36/102) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes +# (+36: 36/138) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes [sm==1] \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg ${makeclean} ${opts}" tmp1=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$?
ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_inl0_hrd* gg_tt*g${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp1 else @@ -135,86 +179,140 @@ else fi ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" -# (48/102) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes +# (+18: 54/138) Three scaling logs (double/mixed/float x hrd0 x inl0) in each of the six SM processes [scaling==1] +if [ "${scaling}" == "1" ]; then + if [ "${sm}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended1sc="$cmd\nENDED(1-scaling) AT $(date) [Status=$status]" + +# (+6: 60/138) Three extra logs (double/float x hrd0 x inl0 + blasOn) only in two of the six SM processes (rebuild may be needed) [blas==1] +if [ "${blas}" == "1" ]; then + if [ "${sm}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" + +# (+6: 66/138) Three scaling logs (double/float x hrd0 x inl0 + blasOn) only in two of the six SM processes [blas==1 || scaling==1] +if [ "${blas}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn -scaling ${opts}" # no rebuild needed + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2sc="$cmd\nENDED(2-scaling) AT $(date) [Status=$status]" + +# (+12: 78/138) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes [sm==1] \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -d_f -hrd -makej -eemumu -ggtt -ggttgg -inlonly ${makeclean} ${opts}" -tmp2=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +tmp3=$(mktemp) +if [ "${sm}" == "1" ]; then $cmd; status=$? - ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp3 else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" +ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" -# (60/102) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) +# (+12: 90/138) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) [sm==1] cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -d_f -bridge ${makeclean} ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" +ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (66/102) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) +# (+6: 96/138) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -rmbhst ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" +ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" -# (72/102) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) +# (+6: 102/138) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f ${rndhst} ${opts}" -if [ "${bsm}" != "-bsmonly" ] && [ "${rndhst}" != "-common" ]; then +if [ "${sm}" == "1" ] && [ "${rndhst}" != "-common" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" +ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" -# (78/102) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) +# (+6: 108/138) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" +ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" -# (102/102) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes +# (+6: 114/138) Three extra logs (double/float x hrd0 x inl0 + noBlas) only in two of the six SM processes (rebuild is needed) [blas==1] +cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -noBlas ${makeclean} ${opts}" +if [ "${blas}" == "1" ]; then + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended8="$cmd\nENDED(8) AT $(date) [Status=$status]" + +# (+24: 138/138) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes [bsm==1] cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb ${makeclean} ${opts}" -tmp3=$(mktemp) -if [ "${bsm}" != "-nobsm" ]; then +tmp9=$(mktemp) +if [ "${bsm}" == "1" ]; then $cmd; status=$? - ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp9 else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" +ended9="$cmd\nENDED(9) AT $(date) [Status=$status]" echo echo "Build(1):" cat $tmp1 echo -echo "Build(2):" -cat $tmp2 +echo "Build(3):" +cat $tmp3 +echo +echo "Build(9):" +cat $tmp9 echo echo -e "$started" echo -e "$ended1" +echo -e "$ended1sc" echo -e "$ended2" +echo -e "$ended2sc" echo -e "$ended3" echo -e "$ended4" echo -e "$ended5" echo -e "$ended6" echo -e "$ended7" +echo -e "$ended8" +echo -e "$ended9" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" echo " ./tput/teeThroughputX.sh -dmf -hrd -makej -ggttggg ${makeclean} ${opts}" + echo " ./tput/teeThroughputX.sh -dmf -makej -ggttggg -scaling ${makeclean} ${opts}" echo " ./tput/teeThroughputX.sh -makej -ggttggg -d_f -bridge ${makeclean} ${opts}" fi diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f5d01d4a4b --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-09-24_08:15:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.055845e+06 1 256 +3.753858e+06 2 256 +7.335611e+06 4 256 +1.381152e+07 8 256 +2.631089e+07 16 256 +4.227169e+07 32 256 +4.374783e+07 64 256 +4.667347e+07 128 256 +5.300001e+07 256 256 +5.625799e+07 512 256 +5.575404e+07 1024 256 +### GPU: scaling test 32 +2.730935e+05 1 32 +5.353588e+05 2 32 +1.179560e+06 4 32 +2.202511e+06 8 32 +3.971147e+06 16 32 +8.376963e+06 32 32 +1.455166e+07 64 32 +2.952242e+07 128 32 +4.540844e+07 256 32 +4.671586e+07 512 32 +4.971688e+07 1024 32 +5.165973e+07 2048 32 +5.386422e+07 4096 32 +5.487265e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.243679e+05 1 256 +9.852786e+05 2 256 +1.021348e+06 4 256 +### CPU: scaling test 32 +1.016809e+06 1 32 +8.445055e+05 2 32 +9.845700e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.489316e+06 1 256 +1.586913e+06 2 256 +1.430959e+06 4 256 +### CPU: scaling test 32 +1.667709e+06 1 32 +1.336229e+06 2 32 +1.494088e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.628634e+06 1 256 +2.644232e+06 2 256 +2.663047e+06 4 256 +### CPU: scaling test 32 +2.356407e+06 1 32 +2.491921e+06 2 32 +2.561998e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.011800e+06 1 256 +3.045534e+06 2 256 +2.739301e+06 4 256 +### CPU: scaling test 32 +1.316222e+06 1 32 +1.911018e+06 2 32 +2.740313e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.926826e+06 1 256 +1.820645e+06 2 256 +1.899254e+06 4 256 +### CPU: scaling test 32 +8.705825e+05 1 32 +1.101378e+06 2 32 +1.693816e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2396150f34..7c9c492237 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:54:52 +DATE: 2025-09-24_07:41:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.715157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495446e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.756115e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.556711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.831304e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.057101e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.722104 sec -INFO: No Floating Point Exceptions have been reported - 2,722,047,064 cycles # 2.855 GHz - 4,240,638,296 instructions # 1.56 insn per cycle - 1.034081868 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.768807 sec + 2,919,942,359 cycles # 2.817 GHz + 4,594,739,998 instructions # 1.57 insn per cycle + 1.096295578 seconds time elapsed +......................................................................... 
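
The `runNcu` profile just below is where this PR's kernel restructuring becomes visible: the old monolithic `sigmaKin` kernel (166 registers per thread) is replaced by what appear to be per-diagram kernels plus a `color_sum_kernel`, at 52, 92 and 18 registers respectively. A rough, illustrative way to see why the register drop matters, assuming V100-class SM limits (65536 registers, 2048 resident threads) and ignoring register-allocation granularity:

```python
# Back-of-envelope illustration (not from this PR): how the lower
# launch__registers_per_thread values reported by ncu below raise the
# occupancy ceiling. Assumes V100-class SM limits (65536 registers,
# 2048 resident threads) and ignores register-allocation granularity.
SM_REGISTERS, SM_MAX_THREADS, WARP = 65536, 2048, 32

def occupancy_ceiling(regs_per_thread):
    # Threads per SM allowed by the register file, rounded down to whole warps
    by_regs = (SM_REGISTERS // regs_per_thread) // WARP * WARP
    return min(by_regs, SM_MAX_THREADS)

for name, regs in [('sigmaKin (old)', 166), ('diagram1', 52),
                   ('diagram2', 92), ('color_sum_kernel', 18)]:
    print(f'{name:18s} {regs:3d} regs/thread -> <= {occupancy_ceiling(regs)} threads/SM')
```

On these assumptions the old kernel tops out around 384 resident threads per SM, while the split kernels allow 1248, 704 and the full 2048 respectively.
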
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.182482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.758023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.130964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.130964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662912 sec -INFO: No Floating Point Exceptions have been reported - 19,208,633,801 cycles # 2.880 GHz - 46,193,026,925 instructions # 2.40 insn per cycle - 6.677929994 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.865866 sec + 19,669,960,331 cycles # 2.864 GHz + 48,570,214,354 instructions # 2.47 insn per cycle + 6.870925701 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.534189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.385509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759182e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.759182e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.563122 sec -INFO: No Floating Point Exceptions have been reported - 13,135,626,695 cycles # 2.874 GHz - 31,728,680,952 instructions # 2.42 insn per cycle - 4.573724377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.964307 sec + 14,204,686,514 cycles # 2.859 GHz + 35,715,585,672 instructions # 2.51 insn per cycle + 4.969392504 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.711147e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.711147e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.974759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.773943e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773943e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.694529 sec -INFO: No Floating Point Exceptions have been 
reported - 10,256,024,954 cycles # 2.769 GHz - 19,694,743,800 instructions # 1.92 insn per cycle - 3.707450749 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.601424 sec + 9,848,259,104 cycles # 2.732 GHz + 18,994,035,594 instructions # 1.93 insn per cycle + 3.606544786 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.944800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690433 sec -INFO: No Floating Point Exceptions have been reported - 10,133,821,420 cycles # 2.743 GHz - 19,357,887,145 instructions # 1.91 insn per cycle - 3.703105135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) 
(512y: 180) (512z: 0) +TOTAL : 3.484592 sec + 9,443,607,033 cycles # 2.707 GHz + 18,622,238,372 instructions # 1.97 insn per cycle + 3.489655116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.663763e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201339e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201339e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.507592e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925987e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.240790 sec -INFO: No Floating Point Exceptions have been reported - 8,791,817,571 cycles # 2.072 GHz - 15,864,118,825 instructions # 1.80 insn per cycle - 4.252718180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.589973 sec + 9,300,481,242 cycles # 2.025 GHz + 14,435,776,634 instructions # 1.55 insn per cycle + 4.595177363 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 97960252e7..bd44162217 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:36:32 +DATE: 2025-09-24_08:54:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.729675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983590e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983590e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.265806e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.377961e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377961e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.228883 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,241,513,211 cycles # 2.923 GHz - 12,978,693,777 instructions # 1.79 insn per cycle - 2.533005072 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 2.391774 sec + 7,602,481,238 cycles # 2.845 GHz + 13,479,408,447 instructions # 1.77 insn per cycle + 2.730155096 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.954014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.445576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088395e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088395e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.972350 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 20,384,148,235 cycles # 2.919 GHz - 46,410,615,309 instructions # 2.28 insn per cycle - 6.984536194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.289356 sec + 20,861,496,468 cycles # 2.861 GHz + 48,776,484,238 instructions # 2.34 insn per cycle + 7.296242635 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921090e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921090e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.325510e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664094e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664094e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.877492 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 14,402,886,877 cycles # 2.946 GHz - 32,567,021,239 instructions # 2.26 insn per cycle - 4.890045852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.384133 sec + 15,443,300,297 cycles # 2.865 GHz + 36,497,013,571 instructions # 2.36 insn per cycle + 5.391017672 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.864025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.539449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.539449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.847889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.533992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533992e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.048395 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,503,225,226 cycles # 2.834 GHz - 21,048,377,803 instructions # 1.83 insn per cycle - 4.060868426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.043561 sec + 11,058,209,453 cycles # 2.731 GHz + 20,294,980,684 instructions # 1.84 insn per cycle + 4.050288399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.889652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.596697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.596697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700153e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001389 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,334,121,636 cycles # 2.824 GHz - 20,717,870,984 instructions # 1.83 insn per cycle - 4.014529771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.878834 sec + 10,652,009,033 cycles # 2.742 GHz + 19,921,987,104 instructions # 1.87 insn per cycle + 3.885646041 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.585647e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.429194e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.797955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.797955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.655129 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,329,600,614 cycles # 2.214 GHz - 17,028,538,054 instructions # 1.65 insn per cycle - 4.667149794 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 5.047263 sec + 10,578,425,795 cycles # 2.094 GHz + 15,527,346,588 instructions # 1.47 insn per cycle + 5.054089505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index a07615eec8..58c909e9f4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:48:44 +DATE: 2025-09-24_09:13:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.479194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613891e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774308e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.363761e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.973740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.252514e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.350685 sec -INFO: No Floating Point Exceptions have been reported - 4,619,154,070 cycles # 2.910 GHz - 7,244,933,472 
instructions # 1.57 insn per cycle - 1.645096659 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.467223 sec + 4,932,154,711 cycles # 2.846 GHz + 7,711,580,022 instructions # 1.56 insn per cycle + 1.792606628 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031231e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.789468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.133945e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.892564 sec -INFO: No Floating Point Exceptions have been reported - 20,216,212,113 cycles # 2.933 GHz - 46,211,289,901 instructions # 2.29 insn per cycle - 6.898049528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.217356 sec + 20,725,832,251 cycles # 2.870 GHz + 48,670,692,155 instructions # 2.35 insn per cycle + 7.222545153 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.575355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.764575e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.764575e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.814476 sec -INFO: No Floating Point Exceptions have been reported - 14,161,512,947 cycles # 2.938 GHz - 31,718,115,030 instructions # 2.24 insn per cycle - 4.820285845 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.317482 sec + 15,270,332,925 cycles # 2.870 GHz + 35,716,070,561 instructions # 2.34 insn per cycle + 5.322758311 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.990481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780031e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780031e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.981842e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.797171e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.797171e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.993697 sec -INFO: No Floating Point Exceptions have been reported - 11,344,220,574 cycles # 2.837 GHz - 19,628,934,109 instructions # 1.73 insn per cycle - 3.999571252 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.963389 sec + 10,924,799,733 cycles # 2.754 GHz + 18,895,724,466 instructions # 1.73 insn per cycle + 3.968762806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.024448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.841239e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841239e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.089877e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986900e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.986900e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.944999 sec -INFO: No Floating Point Exceptions have been reported - 11,153,243,188 cycles # 2.824 GHz - 19,098,861,484 instructions # 1.71 insn per cycle - 3.950731996 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.804740 sec + 10,498,672,406 cycles # 2.756 GHz + 18,322,189,412 instructions # 1.75 insn per cycle + 3.809974126 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
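The counter lines printed after each runExe let the derived figures be re-checked: the GHz annotation is cycles per elapsed second, and "insn per cycle" is instructions divided by cycles. A short sketch with the 512y numbers copied from the log above (for these single-process runs the elapsed time reproduces the quoted frequency):

# Re-derive the annotations on the perf-counter lines (512y run above).
# All three inputs are copied verbatim from the log.
cycles       = 10_498_672_406
instructions = 18_322_189_412
elapsed_s    = 3.809974126

print(f"{cycles / elapsed_s / 1e9:.3f} GHz")          # 2.756 GHz
print(f"{instructions / cycles:.2f} insn per cycle")  # 1.75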
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731970e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.289397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.508177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928540e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.507668 sec -INFO: No Floating Point Exceptions have been reported - 9,996,448,485 cycles # 2.215 GHz - 15,693,646,767 instructions # 1.57 insn per cycle - 4.513790217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.968681 sec + 10,404,827,448 cycles # 2.093 GHz + 14,135,132,724 instructions # 1.36 insn per cycle + 4.974084771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cf4e1a1e41..c37a287617 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:45:58 +DATE: 2025-09-24_09:09:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.516686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553796e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.802555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.361627e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.035071e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.318529e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.994565 sec -INFO: No Floating Point Exceptions have been reported - 3,557,200,491 cycles # 2.898 GHz - 7,056,373,361 
instructions # 1.98 insn per cycle - 1.285636058 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.100243 sec + 3,874,724,836 cycles # 2.841 GHz + 7,570,184,430 instructions # 1.95 insn per cycle + 1.422395260 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036397e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208868e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208868e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.804396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.134896e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134896e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.480716 sec -INFO: No Floating Point Exceptions have been reported - 19,050,518,676 cycles # 2.938 GHz - 46,087,808,907 instructions # 2.42 insn per cycle - 6.486425223 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.834548 sec + 19,642,417,445 cycles # 2.872 GHz + 48,568,603,522 instructions # 2.47 insn per cycle + 6.839870974 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.392923e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767493e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767493e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.441615 sec -INFO: No Floating Point Exceptions have been reported - 13,100,732,544 cycles # 2.946 GHz - 31,624,731,275 instructions # 2.41 insn per cycle - 4.447190414 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.939023 sec + 14,201,994,217 cycles # 2.873 GHz + 35,713,557,240 instructions # 2.51 insn per cycle + 4.944238524 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.962342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.741135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.741135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988977e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.808357e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.808357e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.617520 sec -INFO: No Floating Point Exceptions have been reported - 10,105,971,200 cycles # 2.790 GHz - 19,587,417,861 instructions # 1.94 insn per cycle - 3.623303854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.580138 sec + 9,843,208,156 cycles # 2.746 GHz + 18,992,852,718 instructions # 1.93 insn per cycle + 3.585425477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.035108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.090358e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998949e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.998949e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498153 sec -INFO: No Floating Point Exceptions have been reported - 9,879,352,969 cycles # 2.820 GHz - 19,249,039,766 instructions # 1.95 insn per cycle - 3.504047287 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.424021 sec + 9,418,501,088 cycles # 2.747 GHz + 18,610,322,009 instructions # 1.98 insn per cycle + 3.429438977 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738426e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300548e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.510092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.029997 sec -INFO: No Floating Point Exceptions have been reported - 8,617,786,478 cycles # 2.136 GHz - 15,755,373,979 instructions # 1.83 insn per cycle - 4.035885525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.586814 sec + 9,309,449,457 cycles # 2.028 GHz + 14,433,282,109 instructions # 1.55 insn per cycle + 4.591899749 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 23a95e9b43..5bde08df61 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:10 +DATE: 2025-09-24_09:05:24 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.035607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715605e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.425623e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.947117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.223633e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.900996 sec -INFO: No Floating Point Exceptions have been reported - 6,141,367,935 cycles # 2.877 GHz - 11,470,611,621 instructions # 1.87 insn per cycle - 2.190401749 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 2.024476 sec + 6,554,480,812 cycles # 2.850 GHz + 12,028,784,772 instructions # 1.84 insn per cycle + 2.355414793 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.040250e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.212161e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.212161e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.805260e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.135528e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.135528e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.458941 sec
-INFO: No Floating Point Exceptions have been reported
- 19,062,791,283 cycles # 2.949 GHz
- 46,091,693,422 instructions # 2.42 insn per cycle
- 6.464859061 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.833543 sec
+ 19,634,743,501 cycles # 2.872 GHz
+ 48,568,052,404 instructions # 2.47 insn per cycle
+ 6.838759660 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.576646e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.057103e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.057103e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.393394e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.766191e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.766191e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.406292 sec
-INFO: No Floating Point Exceptions have been reported
- 12,965,800,121 cycles # 2.939 GHz
- 31,623,980,844 instructions # 2.44 insn per cycle
- 4.412202935 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.937510 sec
+ 14,192,824,862 cycles # 2.872 GHz
+ 35,714,478,545 instructions # 2.52 insn per cycle
+ 4.942916419 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.982815e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.782156e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782156e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.977434e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.796954e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.796954e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.583602 sec
-INFO: No Floating Point Exceptions have been reported
- 10,107,254,042 cycles # 2.816 GHz
- 19,587,412,579 instructions # 1.94 insn per cycle
- 3.589639966 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0)
+TOTAL : 3.598950 sec
+ 9,864,189,293 cycles # 2.738 GHz
+ 18,993,120,819 instructions # 1.93 insn per cycle
+ 3.604424478 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.036151e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.856576e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.856576e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.084517e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.984691e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.984691e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.497964 sec
-INFO: No Floating Point Exceptions have been reported
- 9,879,922,849 cycles # 2.820 GHz
- 19,260,007,955 instructions # 1.95 insn per cycle
- 3.503929332 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0)
+TOTAL : 3.431804 sec
+ 9,421,776,417 cycles # 2.742 GHz
+ 18,621,468,870 instructions # 1.98 insn per cycle
+ 3.437118206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.741980e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.303561e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.303561e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.507112e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.920261e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.920261e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.020543 sec
-INFO: No Floating Point Exceptions have been reported
- 8,613,807,526 cycles # 2.140 GHz
- 15,755,294,312 instructions # 1.83 insn per cycle
- 4.026429840 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258)
+TOTAL : 4.592567 sec
+ 9,283,183,314 cycles # 2.020 GHz
+ 14,433,736,941 instructions # 1.55 insn per cycle
+ 4.597725180 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index 25ac5b33ed..389d0ed02a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:55:23
+DATE: 2025-09-24_07:41:50
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.275982e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.504846e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.746692e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.491499e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.666545e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.879724e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.698739 sec
-INFO: No Floating Point Exceptions have been reported
- 2,671,543,996 cycles # 2.868 GHz
- 4,201,680,962 instructions # 1.57 insn per cycle
- 1.042000131 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.765930 sec
+ 2,948,048,535 cycles # 2.831 GHz
+ 4,629,826,040 instructions # 1.57 insn per cycle
+ 1.098401672 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 80
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.030289e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.210430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.210430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.778603e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.132034e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.132034e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.580678 sec
-INFO: No Floating Point Exceptions have been reported
- 19,388,414,039 cycles # 2.942 GHz
- 46,168,116,276 instructions # 2.38 insn per cycle
- 6.592554583 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.849593 sec
+ 19,698,450,548 cycles # 2.874 GHz
+ 48,569,342,520 instructions # 2.47 insn per cycle
+ 6.854647316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 426) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.571872e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.069657e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.069657e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.390958e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.763981e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.763981e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.454497 sec
-INFO: No Floating Point Exceptions have been reported
- 13,123,917,893 cycles # 2.941 GHz
- 31,665,954,915 instructions # 2.41 insn per cycle
- 4.468095413 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1648) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.945278 sec
+ 14,201,944,495 cycles # 2.870 GHz
+ 35,713,685,555 instructions # 2.51 insn per cycle
+ 4.950471310 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1818) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.982748e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.777393e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.777393e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.940285e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.776799e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.776799e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.618164 sec
-INFO: No Floating Point Exceptions have been reported
- 10,210,665,805 cycles # 2.814 GHz
- 19,682,748,403 instructions # 1.93 insn per cycle
- 3.629801888 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1894) (512y: 0) (512z: 0)
+TOTAL : 3.658331 sec
+ 9,992,108,985 cycles # 2.728 GHz
+ 18,993,420,215 instructions # 1.90 insn per cycle
+ 3.663464040 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.010638e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.831487e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.831487e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.098194e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.007315e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.007315e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.575425 sec
-INFO: No Floating Point Exceptions have been reported
- 10,055,677,244 cycles # 2.805 GHz
- 19,379,411,405 instructions # 1.93 insn per cycle
- 3.588891240 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1636) (512y: 178) (512z: 0)
+TOTAL : 3.410724 sec
+ 9,387,426,669 cycles # 2.749 GHz
+ 18,611,041,074 instructions # 1.98 insn per cycle
+ 3.415936625 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1696) (512y: 30) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.768631e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.372427e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.372427e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.511079e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.926773e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.926773e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.003265 sec
-INFO: No Floating Point Exceptions have been reported
- 8,643,505,927 cycles # 2.154 GHz
- 15,697,303,734 instructions # 1.82 insn per cycle
- 4.017112338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 833) (512y: 153) (512z: 1240)
+TOTAL : 4.580455 sec
+ 9,266,907,927 cycles # 2.022 GHz
+ 14,433,883,234 instructions # 1.56 insn per cycle
+ 4.585529206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1076) (512y: 40) (512z: 1204)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165088E-002
-Relative difference = 1.0277089312025782e-08
+Avg ME (F77/C++) = 1.2828039868165090E-002
+Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index 9d9181639f..6748decf76 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:26:55
+DATE: 2025-09-24_08:43:46
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.029061e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.569612e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.860356e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.323552e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.961045e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.238128e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.695056 sec
-INFO: No Floating Point Exceptions have been reported
- 2,704,879,803 cycles # 2.897 GHz
- 4,231,460,596 instructions # 1.56 insn per cycle
- 0.994220648 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.795139 sec
+ 3,018,785,307 cycles # 2.812 GHz
+ 4,761,719,789 instructions # 1.58 insn per cycle
+ 1.135917551 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 92
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.606609e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.069672e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.069672e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.081749e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.276169e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276169e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.367451 sec
-INFO: No Floating Point Exceptions have been reported
- 12,912,062,009 cycles # 2.950 GHz
- 32,678,927,799 instructions # 2.53 insn per cycle
- 4.379017229 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 281) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.239151 sec
+ 17,816,549,785 cycles # 2.857 GHz
+ 42,851,983,164 instructions # 2.41 insn per cycle
+ 6.244587137 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 378) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.977635e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.819919e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819919e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.563944e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.054942e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.054942e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.639596 sec
-INFO: No Floating Point Exceptions have been reported
- 10,716,876,159 cycles # 2.936 GHz
- 25,005,426,831 instructions # 2.33 insn per cycle
- 3.651343591 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1246) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.448005 sec
+ 12,743,189,658 cycles # 2.863 GHz
+ 30,332,835,629 instructions # 2.38 insn per cycle
+ 4.453458721 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1656) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.209379e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.259757e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.259757e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.047332e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.929117e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.929117e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.300079 sec
-INFO: No Floating Point Exceptions have been reported
- 9,398,178,742 cycles # 2.838 GHz
- 16,938,114,674 instructions # 1.80 insn per cycle
- 3.311853262 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1599) (512y: 0) (512z: 0)
+TOTAL : 3.490476 sec
+ 9,538,570,843 cycles # 2.729 GHz
+ 17,003,613,920 instructions # 1.78 insn per cycle
+ 3.496120642 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1745) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.277311e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.397001e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.397001e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199768e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.213460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.213460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.212263 sec -INFO: No Floating Point Exceptions have been reported - 9,139,009,296 cycles # 2.835 GHz - 16,502,297,129 instructions # 1.81 insn per cycle - 3.223908096 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 139) (512z: 0) +TOTAL : 3.274613 sec + 8,968,933,674 cycles # 2.735 GHz + 16,216,764,925 instructions # 1.81 insn per cycle + 3.280120332 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1571) (512y: 20) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921368e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.661482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.661482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.524186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949184e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949184e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.727052 sec -INFO: No Floating Point Exceptions have been reported - 8,146,634,535 cycles # 2.180 GHz - 14,661,732,896 instructions # 1.80 insn per cycle - 3.738643291 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1003) (512y: 158) (512z: 946) +TOTAL : 4.546459 sec + 9,195,001,582 cycles # 2.021 GHz + 13,328,215,793 instructions # 1.45 insn per cycle + 4.551986341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 850) (512y: 32) (512z: 1193) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index abe54e8953..16f206046e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:22 +DATE: 2025-09-24_08:44:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.921706e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.715910e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.877358e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.209909e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.602152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.846272e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678632 sec -INFO: No Floating Point Exceptions have been reported - 2,636,898,249 cycles # 2.884 GHz - 4,067,260,892 instructions # 1.54 insn per 
cycle - 0.973352356 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.791421 sec + 2,992,715,233 cycles # 2.825 GHz + 4,674,908,446 instructions # 1.56 insn per cycle + 1.118373682 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 80 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.084164e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.941928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.941928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.283204e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283204e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.469758 sec -INFO: No Floating Point Exceptions have been reported - 10,217,900,291 cycles # 2.936 GHz - 25,614,437,724 instructions # 2.51 insn per cycle - 3.480862891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 236) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.198929 sec + 17,796,756,971 cycles # 2.869 GHz + 42,710,158,130 instructions # 2.40 insn per cycle + 6.204593111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.313032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.587243e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096649e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.173341 sec -INFO: No Floating Point Exceptions have been reported - 9,354,473,123 cycles # 2.939 GHz - 21,650,720,885 instructions # 2.31 insn per cycle - 3.184272296 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.387338 sec + 12,598,423,061 cycles # 2.869 GHz + 29,976,674,000 instructions # 2.38 insn per cycle + 4.392925762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.358550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604458e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604458e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.057092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.937431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.937431e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.115986 sec -INFO: No Floating Point Exceptions have been reported - 8,850,186,465 cycles # 2.831 GHz - 16,062,849,181 instructions # 1.81 insn per cycle - 3.126797345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +TOTAL : 3.475327 sec + 9,528,175,479 cycles # 2.738 GHz + 16,858,022,582 instructions # 1.77 insn per cycle + 3.481000779 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.422935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.724037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.724037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.212448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.240328e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240328e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.043301 sec -INFO: No Floating Point Exceptions have been reported - 8,651,791,606 cycles # 2.834 GHz - 15,666,461,627 instructions # 1.81 insn per cycle - 3.054177777 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1264) (512y: 141) (512z: 0) +TOTAL : 3.258089 sec + 8,944,166,150 cycles # 2.741 GHz + 16,099,275,495 instructions # 1.80 insn per cycle + 3.263523085 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1529) (512y: 20) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.052275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908416e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908416e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536323e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.514529 sec -INFO: No Floating Point Exceptions have been reported - 7,791,531,975 cycles # 2.211 GHz - 14,393,714,103 instructions # 1.85 insn per cycle - 3.525649878 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1031) (512y: 164) (512z: 876) +TOTAL : 4.514695 sec + 9,168,428,013 cycles # 2.029 GHz + 13,276,409,493 instructions # 1.45 insn per cycle + 4.520383256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 812) (512y: 32) (512z: 1193) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..d210e40b75 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-09-24_08:16:33 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.995915e+06 1 256 +4.200509e+06 2 256 +8.083870e+06 4 256 +1.608305e+07 8 256 +3.127720e+07 16 256 +4.266755e+07 32 256 +7.804246e+07 64 256 +8.515747e+07 128 256 +9.772246e+07 256 256 +1.061822e+08 512 256 +1.118796e+08 1024 256 +### GPU: scaling test 32 +3.184492e+05 1 32 +6.285479e+05 2 32 +1.262626e+06 4 32 +2.123354e+06 8 32 +4.536835e+06 16 32 +8.483985e+06 32 32 +1.721211e+07 64 32 +3.215777e+07 128 32 +5.609384e+07 256 32 +7.417904e+07 512 32 +9.275336e+07 1024 32 +9.687981e+07 2048 32 +1.055152e+08 4096 32 +1.093946e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.077237e+06 1 256 +1.027248e+06 2 256 +1.058976e+06 4 256 +### CPU: scaling test 32 +1.051456e+06 1 32 +1.083149e+06 2 32 +1.016704e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.464572e+06 1 256 +2.473478e+06 2 256 +2.516181e+06 4 256 +### CPU: scaling test 32 +2.544529e+06 1 32 +1.386963e+06 2 32 +2.294853e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.225139e+06 1 256 +4.673750e+06 2 256 +5.300893e+06 4 256 +### CPU: scaling test 32 +1.913876e+06 1 32 +2.472284e+06 2 32 +3.577518e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.736952e+06 1 256 +4.935987e+06 2 256 +5.245257e+06 4 256 +### CPU: scaling test 32 +1.886125e+06 1 32 +4.425697e+06 2 32 +3.692805e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.704132e+06 1 256 +3.924725e+06 2 256 +4.057052e+06 4 256 +### CPU: scaling test 32 +4.436434e+06 1 32 +4.954711e+06 2 32 +3.376418e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fa697401ba..2544ccf168 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:56 +DATE: 2025-09-24_07:43:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.318402e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.547340e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.573294e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.831331e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147275e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192627e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580539 sec -INFO: No Floating Point Exceptions have been reported - 2,318,735,379 cycles # 2.865 GHz - 3,612,120,055 instructions # 1.56 insn per cycle - 0.879357898 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.629958 sec + 2,533,063,587 cycles # 2.828 GHz + 3,919,306,559 instructions # 1.55 insn per cycle + 0.954740453 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no 
SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.072197e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.275533e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275533e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.002594e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.174243e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.174243e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.274259 sec
-INFO: No Floating Point Exceptions have been reported
- 18,464,131,410 cycles # 2.940 GHz
- 45,058,020,075 instructions # 2.44 insn per cycle
- 6.281329583 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.656687 sec
+ 19,080,722,877 cycles # 2.866 GHz
+ 47,603,966,972 instructions # 2.49 insn per cycle
+ 6.661349360 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.257463e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.446957e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.446957e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.997743e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.907583e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.907583e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.180369 sec
-INFO: No Floating Point Exceptions have been reported
- 9,372,467,471 cycles # 2.941 GHz
- 22,319,965,268 instructions # 2.38 insn per cycle
- 3.189536232 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.532505 sec
+ 10,129,083,857 cycles # 2.864 GHz
+ 24,678,199,043 instructions # 2.44 insn per cycle
+ 3.538004837 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.408379e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.710073e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.710073e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.020145e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.374335e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.374335e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.001033 sec
-INFO: No Floating Point Exceptions have been reported
- 8,493,792,111 cycles # 2.825 GHz
- 15,797,222,111 instructions # 1.86 insn per cycle
- 3.010052254 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.458457 sec
+ 6,780,782,058 cycles # 2.757 GHz
+ 13,852,462,300 instructions # 2.04 insn per cycle
+ 2.463155894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.426130e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.768067e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.768067e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.125396e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.710524e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.710524e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.984448 sec
-INFO: No Floating Point Exceptions have been reported
- 8,427,466,763 cycles # 2.818 GHz
- 15,640,000,146 instructions # 1.86 insn per cycle
- 2.993491493 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.391515 sec
+ 6,609,679,244 cycles # 2.759 GHz
+ 13,637,397,629 instructions # 2.06 insn per cycle
+ 2.396879313 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.427110e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.709739e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.709739e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.559405e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.068028e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.068028e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.984139 sec
-INFO: No Floating Point Exceptions have been reported
- 6,725,622,216 cycles # 2.249 GHz
- 12,910,486,373 instructions # 1.92 insn per cycle
- 2.994013668 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 2.839980 sec
+ 6,297,156,841 cycles # 2.214 GHz
+ 11,613,908,330 instructions # 1.84 insn per cycle
+ 2.845523276 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 9136826931..2d596cea9c 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,255 +10,223 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:37:07
+DATE: 2025-09-24_08:54:44
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.256593e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.121486e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.121486e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.537867e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.260892e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.260892e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.691319 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 5,610,006,933 cycles # 2.911 GHz
- 10,218,919,767 instructions # 1.82 insn per cycle
- 1.984436466 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
+TOTAL : 1.800983 sec
+ 5,877,978,226 cycles # 2.848 GHz
+ 10,604,089,001 instructions # 1.80 insn per cycle
+ 2.121531688 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.060836e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.248384e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.248384e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.857283e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.151088e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.151088e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.418392 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 18,916,088,440 cycles # 2.945 GHz
- 45,156,650,630 instructions # 2.39 insn per cycle
- 6.425565221 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.870612 sec
+ 19,709,472,888 cycles # 2.867 GHz
+ 47,719,122,117 instructions # 2.42 insn per cycle
+ 6.877276018 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.163234e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.223206e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.223206e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.916311e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.741610e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.741610e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.414716 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 10,073,193,872 cycles # 2.945 GHz
- 23,610,645,909 instructions # 2.34 insn per cycle
- 3.421707000 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.794312 sec
+ 10,876,376,303 cycles # 2.862 GHz
+ 25,980,668,027 instructions # 2.39 insn per cycle
+ 3.800880902 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.302389e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.467769e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.467769e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.867500e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.917538e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.917538e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.241454 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,215,037,610 cycles # 2.837 GHz
- 16,874,646,512 instructions # 1.83 insn per cycle
- 3.248598680 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.702979 sec
+ 7,478,549,066 cycles # 2.761 GHz
+ 14,938,162,567 instructions # 2.00 insn per cycle
+ 2.709529742 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.316990e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.533576e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.533576e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.974162e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.230625e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.230625e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.224710 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,166,209,661 cycles # 2.837 GHz
- 16,710,284,997 instructions # 1.82 insn per cycle
- 3.231713030 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.623723 sec
+ 7,304,049,284 cycles # 2.778 GHz
+ 14,726,726,525 instructions # 2.02 insn per cycle
+ 2.630269239 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.333210e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.469405e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.469405e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.438003e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.763905e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.763905e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 3.205451 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 7,432,634,328 cycles # 2.315 GHz
- 14,074,642,515 instructions # 1.89 insn per cycle
- 3.212353581 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 3.095394 sec
+ 7,047,884,451 cycles # 2.273 GHz
+ 11,323,410,817 instructions # 1.81 insn per cycle
+ 3.101946602 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index 3c8228d85b..0c2fad6cf5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:49:18
+DATE: 2025-09-24_09:13:58
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.233592e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.244967e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.184868e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.359598e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.131777e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186519e+08 ) sec^-1
MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.220966 sec
-INFO: No Floating Point Exceptions have been reported
- 4,183,681,416 cycles # 2.867 GHz
- 6,662,508,205 instructions # 1.59 insn per cycle
- 1.516447212 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 1.272802 sec
+ 4,352,229,517 cycles # 2.840 GHz
+ 6,942,191,399 instructions # 1.60 insn per cycle
+ 1.588683612 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.080178e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.275874e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275874e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.002593e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173999e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.173999e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 6.537773 sec
-INFO: No Floating Point Exceptions have been reported
- 19,269,764,932 cycles # 2.946 GHz
- 45,190,617,795 instructions # 2.35 insn per cycle
- 6.543013626 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.000453 sec
+ 20,098,479,833 cycles # 2.870 GHz
+ 47,788,547,293 instructions # 2.38 insn per cycle
+ 7.005540408 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.263942e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.453881e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.453881e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.994510e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.919066e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.919066e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 3.487545 sec
-INFO: No Floating Point Exceptions have been reported
- 10,298,424,695 cycles # 2.949 GHz
- 22,355,388,978 instructions # 2.17 insn per cycle
- 3.493059791 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.881179 sec
+ 11,140,538,881 cycles # 2.868 GHz
+ 24,759,550,994 instructions # 2.22 insn per cycle
+ 3.886423602 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.406924e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.701531e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.701531e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.035056e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.439882e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.439882e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.322767 sec
-INFO: No Floating Point Exceptions have been reported
- 9,443,809,325 cycles # 2.838 GHz
- 15,664,102,195 instructions # 1.66 insn per cycle
- 3.328357008 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.795045 sec
+ 7,779,386,268 cycles # 2.779 GHz
+ 13,765,224,553 instructions # 1.77 insn per cycle
+ 2.800026900 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.446360e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.803645e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.803645e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.124621e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.719457e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.719457e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.287328 sec
-INFO: No Floating Point Exceptions have been reported
- 9,371,124,961 cycles # 2.847 GHz
- 15,299,944,141 instructions # 1.63 insn per cycle
- 3.292839828 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.744834 sec
+ 7,614,459,531 cycles # 2.770 GHz
+ 13,352,845,576 instructions # 1.75 insn per cycle
+ 2.749922574 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.466708e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.777222e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.777222e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.573065e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.100608e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.100608e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.269312 sec
-INFO: No Floating Point Exceptions have been reported
- 7,659,274,117 cycles # 2.340 GHz
- 12,573,895,764 instructions # 1.64 insn per cycle
- 3.274843213 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 3.180653 sec
+ 7,312,343,971 cycles # 2.297 GHz
+ 11,323,410,817 instructions # 1.55 insn per cycle
+ 3.185765312 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
index 7f30dafdfd..912bbfb1fe 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:46:29 +DATE: 2025-09-24_09:09:56 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.282321e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.333955e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.369324e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.383056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.141193e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196294e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.867789 sec -INFO: No Floating Point Exceptions have been reported - 3,167,199,789 cycles # 2.899 GHz - 6,506,216,930 instructions # 2.05 insn per cycle - 1.149942283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.930327 sec + 3,356,938,087 cycles # 2.833 GHz + 6,810,076,509 instructions # 2.03 insn per cycle + 1.242803897 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.085219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.003317e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175897e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175897e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.166056 sec -INFO: No Floating Point Exceptions have been reported - 18,234,644,828 cycles # 2.955 GHz - 45,008,398,832 instructions # 2.47 insn per cycle - 6.171760600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.648480 sec + 19,101,943,380 cycles # 2.872 GHz + 47,607,190,590 instructions # 2.49 insn per cycle + 6.653713406 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.256894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 3.462086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002586e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.913648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.913648e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.159870 sec -INFO: No Floating Point Exceptions have been reported - 9,347,982,513 cycles # 2.954 GHz - 22,275,896,372 instructions # 2.38 insn per cycle - 3.165402193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.525616 sec + 10,120,619,743 cycles # 2.867 GHz + 24,678,227,678 instructions # 2.44 insn per cycle + 3.530850186 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039330997854E-002 +Relative difference = 5.215154825545255e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.410366e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.712636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.712636e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.030738e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.420413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.420413e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983322 sec -INFO: No Floating Point Exceptions have been reported - 8,463,194,185 cycles # 2.833 GHz - 15,755,395,679 instructions # 1.86 insn per cycle - 2.988746216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 2.452320 sec + 6,811,344,176 cycles # 2.773 GHz + 13,854,164,765 instructions # 2.03 insn per cycle + 2.457451990 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.801490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.150584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.795938e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 5.795938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.933599 sec -INFO: No Floating Point Exceptions have been reported - 8,319,397,972 cycles # 2.832 GHz - 15,593,973,322 instructions # 1.87 insn per cycle - 2.939101584 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.374395 sec + 6,591,965,501 cycles # 2.771 GHz + 13,642,240,290 instructions # 2.07 insn per cycle + 2.379679090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.469652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.575282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.089269e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089269e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270342e-06 ) GeV^0 -TOTAL : 2.922384 sec -INFO: No Floating Point Exceptions have been reported - 6,636,368,959 cycles # 2.267 GHz - 12,865,256,567 instructions # 1.94 insn per cycle - 2.927905791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 2.822750 sec + 6,268,766,960 cycles # 2.218 GHz + 11,613,826,957 instructions # 1.85 insn per cycle + 2.828064655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052638724330E-002 +Relative difference = 2.0569956691141665e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index e2ecb9b5fd..0463fd0c8a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,238 +10,219 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:42 +DATE: 2025-09-24_09:06:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.979354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311142e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251832e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.112738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.116501e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169674e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.493081 sec -INFO: No Floating Point Exceptions have been reported - 5,009,051,141 cycles # 2.916 GHz - 9,204,393,500 instructions # 1.84 insn per cycle - 1.774548277 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.583873 sec + 5,234,229,196 cycles # 2.846 GHz + 9,549,805,518 instructions # 1.82 insn per cycle + 1.896392697 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.077151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.276926e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276926e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.002184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173237e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.173237e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 
) GeV^0 -TOTAL : 6.211513 sec -INFO: No Floating Point Exceptions have been reported - 18,299,232,198 cycles # 2.944 GHz - 45,005,768,829 instructions # 2.46 insn per cycle - 6.217115880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.663048 sec + 19,100,313,751 cycles # 2.867 GHz + 47,607,216,586 instructions # 2.49 insn per cycle + 6.669193607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.460029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.921109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.921109e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.148224 sec -INFO: No Floating Point Exceptions have been reported - 9,293,240,022 cycles # 2.948 GHz - 22,275,553,802 instructions # 2.40 insn per cycle 
- 3.153857529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.525124 sec + 10,134,330,212 cycles # 2.871 GHz + 24,677,538,220 instructions # 2.44 insn per cycle + 3.530302654 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039330997854E-002 +Relative difference = 5.215154825545255e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.395770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.022088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.399929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.399929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.994942 sec -INFO: No Floating Point Exceptions have been reported - 8,447,981,393 cycles # 2.817 GHz - 15,754,576,494 instructions # 1.86 insn per cycle - 3.000419944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 2.457277 sec + 6,793,614,730 cycles # 2.760 GHz + 13,854,443,684 instructions # 2.04 insn per cycle + 2.462524480 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.419912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.144844e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775686e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775686e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.971435 sec -INFO: No Floating Point Exceptions have been reported - 8,357,800,499 cycles # 2.808 GHz - 15,594,139,449 instructions # 1.87 insn per cycle - 2.977163262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.377475 sec + 6,596,944,634 cycles 
# 2.770 GHz + 13,642,560,323 instructions # 2.07 insn per cycle + 2.382462188 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.455367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.730952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.730952e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.581109e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.933639 sec -INFO: No Floating Point Exceptions have been reported - 6,669,997,057 cycles # 2.271 GHz - 12,867,351,511 instructions # 1.93 insn per cycle - 2.938851588 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 2.817117 sec + 6,281,174,018 cycles # 2.226 GHz + 11,613,521,783 instructions # 1.85 insn per cycle + 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
index 9e915de581..895d76fe30 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:57:23
+DATE: 2025-09-24_07:44:14
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.310707e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.890276e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.030864e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.359826e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.021023e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.056478e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.577005 sec
-INFO: No Floating Point Exceptions have been reported
- 2,340,023,876 cycles # 2.880 GHz
- 3,638,052,704 instructions # 1.55 insn per cycle
- 0.886148283 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 79
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.631340 sec
+ 2,532,233,300 cycles # 2.830 GHz
+ 3,970,550,794 instructions # 1.57 insn per cycle
+ 0.951025749 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 40
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.074456e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.269687e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.269687e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.000634e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.172912e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.172912e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.244443 sec
-INFO: No Floating Point Exceptions have been reported
- 18,377,232,357 cycles # 2.941 GHz
- 45,025,324,964 instructions # 2.45 insn per cycle
- 6.253002386 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 397) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.666988 sec
+ 19,115,716,767 cycles # 2.867 GHz
+ 47,603,231,786 instructions # 2.49 insn per cycle
+ 6.671571279 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 400) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039854866802E-002
 Relative difference = 1.1313746984080878e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.251309e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.439034e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.439034e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.995961e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.913123e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.913123e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.184453 sec
-INFO: No Floating Point Exceptions have been reported
- 9,383,250,913 cycles # 2.940 GHz
- 22,280,358,761 instructions # 2.37 insn per cycle
- 3.194375038 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1935) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.534067 sec
+ 10,148,756,573 cycles # 2.868 GHz
+ 24,677,225,829 instructions # 2.43 insn per cycle
+ 3.539611093 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2223) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.403334e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.700033e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.700033e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.044338e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.456169e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.456169e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.004384 sec
-INFO: No Floating Point Exceptions have been reported
- 8,513,730,278 cycles # 2.827 GHz
- 15,791,909,505 instructions # 1.85 insn per cycle
- 3.013283160 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2540) (512y: 0) (512z: 0)
+TOTAL : 2.445579 sec
+ 6,775,772,019 cycles # 2.766 GHz
+ 13,851,023,344 instructions # 2.04 insn per cycle
+ 2.450357325 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2150) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.444935e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.799463e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.799463e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.145847e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.775490e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.775490e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.959849 sec
-INFO: No Floating Point Exceptions have been reported
- 8,395,161,248 cycles # 2.830 GHz
- 15,634,676,534 instructions # 1.86 insn per cycle
- 2.968734397 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 10) (512z: 0)
+TOTAL : 2.378684 sec
+ 6,605,443,400 cycles # 2.772 GHz
+ 13,641,787,060 instructions # 2.07 insn per cycle
+ 2.384046135 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2048) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.454317e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.767111e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.767111e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.572831e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.087699e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.087699e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.955150 sec
-INFO: No Floating Point Exceptions have been reported
- 6,701,822,130 cycles # 2.263 GHz
- 12,886,633,037 instructions # 1.92 insn per cycle
- 2.963931226 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 16) (512z: 1427)
+TOTAL : 2.828624 sec
+ 6,290,623,334 cycles # 2.221 GHz
+ 11,612,779,469 instructions # 1.85 insn per cycle
+ 2.833896598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1524) (512y: 0) (512z: 1309)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052564145764E-002
-Relative difference = 1.9988585667912256e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
index 1fabc46555..c7a807c8b1 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:27:46
+DATE: 2025-09-24_08:44:57
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.309386e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.516838e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.621181e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.330596e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.123948e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.177971e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.581015 sec
-INFO: No Floating Point Exceptions have been reported
- 2,337,717,863 cycles # 2.893 GHz
- 3,666,959,770 instructions # 1.57 insn per cycle
- 0.866189287 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.636606 sec
+ 2,522,939,985 cycles # 2.819 GHz
+ 3,982,740,936 instructions # 1.58 insn per cycle
+ 0.955499259 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.617887e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.109367e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.109367e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115184e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.332866e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.332866e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 4.275933 sec
-INFO: No Floating Point Exceptions have been reported
- 12,412,341,686 cycles # 2.900 GHz
- 32,352,281,163 instructions # 2.61 insn per cycle
- 4.283041784 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 290) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.020502 sec
+ 17,288,157,810 cycles # 2.870 GHz
+ 43,112,026,250 instructions # 2.49 insn per cycle
+ 6.025655106 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039840314887E-002
-Relative difference = 1.244813035273009e-08
+Avg ME (F77/C++) = 1.2828039854866802E-002
+Relative difference = 1.1313746984080878e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.642717e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.471061e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.471061e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.160220e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.264225e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.264225e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.775228 sec
-INFO: No Floating Point Exceptions have been reported
- 8,161,861,180 cycles # 2.934 GHz
- 18,732,698,985 instructions # 2.30 insn per cycle
- 2.782796507 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1534) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.296022 sec
+ 9,453,035,600 cycles # 2.865 GHz
+ 22,254,994,493 instructions # 2.35 insn per cycle
+ 3.301184224 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2055) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039283704129E-002
-Relative difference = 5.583829420356249e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.771950e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.635210e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.635210e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.147395e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.794407e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.794407e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.653741 sec
-INFO: No Floating Point Exceptions have been reported
- 7,565,022,779 cycles # 2.844 GHz
- 14,293,093,213 instructions # 1.89 insn per cycle
- 2.661141426 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2234) (512y: 0) (512z: 0)
+TOTAL : 2.374550 sec
+ 6,593,450,184 cycles # 2.772 GHz
+ 12,791,908,931 instructions # 1.94 insn per cycle
+ 2.379938233 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053244447801E-002
-Relative difference = 2.5291823782248813e-07
+Avg ME (F77/C++) = 1.2828053249904769E-002
+Relative difference = 2.5334363125411937e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.799741e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.762487e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.762487e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.287553e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.275464e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.275464e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.634363 sec
-INFO: No Floating Point Exceptions have been reported
- 7,504,285,407 cycles # 2.842 GHz
- 13,994,355,792 instructions # 1.86 insn per cycle
- 2.641913370 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2087) (512y: 3) (512z: 0)
+TOTAL : 2.292894 sec
+ 6,391,667,866 cycles # 2.782 GHz
+ 12,449,157,303 instructions # 1.95 insn per cycle
+ 2.298194570 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053244447801E-002
-Relative difference = 2.5291823782248813e-07
+Avg ME (F77/C++) = 1.2828053249904769E-002
+Relative difference = 2.5334363125411937e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.507958e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.890935e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.890935e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.627084e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.223395e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.223395e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.900923 sec
-INFO: No Floating Point Exceptions have been reported
- 6,641,718,947 cycles # 2.284 GHz
- 13,481,348,782 instructions # 2.03 insn per cycle
- 2.908502130 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1201)
+TOTAL : 2.779339 sec
+ 6,199,704,984 cycles # 2.227 GHz
+ 11,064,577,835 instructions # 1.78 insn per cycle
+ 2.784571382 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1175) (512y: 0) (512z: 1267)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052562326775E-002
-Relative difference = 1.997440588685788e-07
+Avg ME (F77/C++) = 1.2828052595068584E-002
+Relative difference = 2.0229641945836646e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
index ddc690e546..217e31f76a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:28:10
+DATE: 2025-09-24_08:45:27
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.311525e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.893939e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.130206e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.890476e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.949885e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.037118e+08 ) sec^-1
MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.580982 sec
-INFO: No Floating Point Exceptions have been reported
- 2,326,498,884 cycles # 2.887 GHz
- 3,595,400,053 instructions # 1.55 insn per cycle
- 0.865243472 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 79
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.647806 sec
+ 2,556,041,252 cycles # 2.823 GHz
+ 3,995,093,038 instructions # 1.56 insn per cycle
+ 0.964151817 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 40
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.199736e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.210916e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.210916e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.130758e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.354394e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.354394e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.247253 sec
-INFO: No Floating Point Exceptions have been reported
- 9,460,485,661 cycles # 2.907 GHz
- 25,749,028,052 instructions # 2.72 insn per cycle
- 3.254869601 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.943582 sec
+ 17,072,881,150 cycles # 2.871 GHz
+ 42,123,289,663 instructions # 2.47 insn per cycle
+ 5.948757824 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 386) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039838495897E-002
-Relative difference = 1.2589928273811243e-08
+Avg ME (F77/C++) = 1.2828039854866802E-002
+Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.982142e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.480555e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.480555e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.181674e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.305409e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.305409e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.498717 sec
-INFO: No Floating Point Exceptions have been reported
- 7,385,528,393 cycles # 2.949 GHz
- 16,812,365,380 instructions # 2.28 insn per cycle
- 2.506313604 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1311) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.266811 sec
+ 9,398,737,688 cycles # 2.874 GHz
+ 22,004,483,420 instructions # 2.34 insn per cycle
+ 3.271919523 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2024) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.917887e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.065921e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.065921e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.139939e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.774728e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.774728e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.542096 sec
-INFO: No Floating Point Exceptions have been reported
- 7,260,793,625 cycles # 2.848 GHz
- 13,703,433,227 instructions # 1.89 insn per cycle
- 2.549878549 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2067) (512y: 0) (512z: 0)
+TOTAL : 2.381212 sec
+ 6,591,131,680 cycles # 2.763 GHz
+ 12,722,446,784 instructions # 1.93 insn per cycle
+ 2.386360015 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1900) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053220800939E-002
-Relative difference = 2.5107486628541925e-07
+Avg ME (F77/C++) = 1.2828053264456685E-002
+Relative difference = 2.5447801373846945e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.947392e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.166768e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.166768e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.297363e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.310074e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.310074e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.537410 sec
-INFO: No Floating Point Exceptions have been reported
- 7,253,478,894 cycles # 2.851 GHz
- 13,505,585,795 instructions # 1.86 insn per cycle
- 2.545044336 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1935) (512y: 7) (512z: 0)
+TOTAL : 2.286925 sec
+ 6,344,873,106 cycles # 2.769 GHz
+ 12,379,326,234 instructions # 1.95 insn per cycle
+ 2.292021921 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1762) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053220800939E-002
-Relative difference = 2.5107486628541925e-07
+Avg ME (F77/C++) = 1.2828053264456685E-002
+Relative difference = 2.5447801373846945e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.612725e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.139660e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.139660e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.631209e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.246457e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.246457e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.798296 sec
-INFO: No Floating Point Exceptions have been reported
- 6,447,529,861 cycles # 2.298 GHz
- 13,215,855,857 instructions # 2.05 insn per cycle
- 2.806480502 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 2) (512z: 1081)
+TOTAL : 2.775253 sec
+ 6,198,296,648 cycles # 2.230 GHz
+ 11,034,050,222 instructions # 1.78 insn per cycle
+ 2.780602894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1127) (512y: 0) (512z: 1264)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052536860923E-002
-Relative difference = 1.977588895209662e-07
+Avg ME (F77/C++) = 1.2828052620534436E-002
+Relative difference = 2.0428158880597908e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..f6314e4b0d
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+DATE: 2025-09-24_08:16:12
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.719679e+06 1 256
+3.735508e+06 2 256
+7.293707e+06 4 256
+1.360978e+07 8 256
+2.685655e+07 16 256
+4.383703e+07 32 256
+4.540856e+07 64 256
+4.950126e+07 128 256
+5.330683e+07 256 256
+5.631852e+07 512 256
+5.657810e+07 1024 256
+### GPU: scaling test 32
+3.006756e+05 1 32
+6.061812e+05 2 32
+1.142113e+06 4 32
+2.056274e+06 8 32
+3.922139e+06 16 32
+8.505902e+06 32 32
+1.586109e+07 64 32
+2.915572e+07 128 32
+4.360323e+07 256 32
+4.646757e+07 512 32
+4.804819e+07 1024 32
+5.207345e+07 2048 32
+5.450514e+07 4096 32
+5.497340e+07 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.775358e+05 1 256
+9.632153e+05 2 256
+1.050268e+06 4 256
+### CPU: scaling test 32
+9.325368e+05 1 32
+5.475093e+05 2 32
+1.048519e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.655394e+06 1 256
+1.632898e+06 2 256
+1.597165e+06 4 256
+### CPU: scaling test 32
+1.718859e+06 1 32
+1.408389e+06 2 32
+1.612985e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.947611e+06 1 256
+2.989467e+06 2 256
+2.930193e+06 4 256
+### CPU: scaling test 32
+2.413455e+06 1 32
+2.808742e+06 2 32
+2.888087e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.652163e+06 1 256
+3.010862e+06 2 256
+3.216323e+06 4 256
+### CPU: scaling test 32
+2.566570e+06 1 32
+2.745603e+06 2 32
+3.084635e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.854589e+06 1 256
+1.864150e+06 2 256
+1.940552e+06 4 256
+### CPU: scaling test 32
+1.188707e+06 1 32
+1.480762e+06 2 32
+1.728398e+06 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index 8e00f9820d..61423e69fe 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:55:54
+DATE: 2025-09-24_07:42:27
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.055673e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.658424e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.851508e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.608721e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.979951e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.218009e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.678781 sec
-INFO: No Floating Point Exceptions have been reported
- 2,628,768,348 cycles # 2.876 GHz
- 4,103,389,790 instructions # 1.56 insn per cycle
- 1.044225431 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.760428 sec
+ 2,903,810,857 cycles # 2.835 GHz
+ 4,637,834,708 instructions # 1.60 insn per cycle
+ 1.081616389 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 92
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282804e-02
Avg ME (F77/GPU) = 1.2828039901590279E-002
Relative difference = 7.671454200650844e-09
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.011376e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.175905e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.175905e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.680084e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.118472e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.118472e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.671756 sec
-INFO: No Floating Point Exceptions have been reported
- 19,661,999,702 cycles # 2.943 GHz
- 46,395,546,050 instructions # 2.36 insn per cycle
- 6.683261433 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.917089 sec
+ 19,877,690,092 cycles # 2.872 GHz
+ 48,645,342,412 instructions # 2.45 insn per cycle
+ 6.922363261 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039952548879E-002
Relative difference = 3.6990156841838714e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.631538e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.161697e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.161697e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.456171e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.869473e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.869473e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.308846 sec
-INFO: No Floating Point Exceptions have been reported
- 12,713,127,116 cycles # 2.944 GHz
- 31,571,564,120 instructions # 2.48 insn per cycle
- 4.322869208 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1731) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.742828 sec
+ 13,638,147,605 cycles # 2.873 GHz
+ 35,198,976,108 instructions # 2.58 insn per cycle
+ 4.748594895 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1837) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039952548879E-002
Relative difference = 3.6990156841838714e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.963768e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.746755e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.746755e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.037647e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.902549e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.902549e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.649356 sec
-INFO: No Floating Point Exceptions have been reported
- 10,294,572,937 cycles # 2.814 GHz
- 19,586,622,017 instructions # 1.90 insn per cycle
- 3.662289672 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0)
+TOTAL : 3.506028 sec
+ 9,616,164,024 cycles # 2.739 GHz
+ 18,694,922,165 instructions # 1.94 insn per cycle
+ 3.511882526 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1921) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.001856e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.818080e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.818080e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.162409e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.131666e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.131666e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.589952 sec
-INFO: No Floating Point Exceptions have been reported
- 10,108,826,304 cycles # 2.808 GHz
- 19,396,692,714 instructions # 1.92 insn per cycle
- 3.602641354 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1799) (512y: 188) (512z: 0)
+TOTAL : 3.325331 sec
+ 9,147,653,531 cycles # 2.747 GHz
+ 18,304,911,433 instructions # 2.00 insn per cycle
+ 3.331056713 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1798) (512y: 30) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.801777e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.420597e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.420597e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.529227e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.959559e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.959559e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.939332 sec
-INFO: No Floating Point Exceptions have been reported
- 8,555,878,739 cycles # 2.167 GHz
- 15,216,666,169 instructions # 1.78 insn per cycle
- 3.951287451 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 966) (512y: 154) (512z: 1330)
+TOTAL : 4.530804 sec
+ 9,205,145,539 cycles # 2.030 GHz
+ 14,289,269,729 instructions # 1.55 insn per cycle
+ 4.536419747 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1189) (512y: 40) (512z: 1209)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
index 0283d4438d..baab060c45 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:25 +DATE: 2025-09-24_07:43:04 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.048170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867900e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499014e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.687427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.902055e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677955 sec -INFO: No Floating Point Exceptions have been reported - 2,610,429,449 cycles # 2.847 GHz - 4,074,904,816 instructions # 1.56 insn per 
cycle - 1.028610198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.769299 sec + 2,927,220,098 cycles # 2.834 GHz + 4,600,691,724 instructions # 1.57 insn per cycle + 1.091683776 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 80 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039901590279E-002 Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.012794e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.625912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111912e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111912e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662178 sec -INFO: No Floating Point Exceptions have been reported - 19,608,707,308 cycles # 2.939 GHz - 46,331,953,932 instructions # 2.36 insn per cycle - 6.674225175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.954919 sec + 19,977,286,420 cycles # 2.871 GHz + 48,643,751,431 instructions # 2.43 insn per cycle + 6.960548322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 426) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
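The '-p 2048 256 12' arguments of the check executables appear to request 2048 blocks (or SIMD pages) of 256 events for 12 iterations, about 6.3 million matrix elements per run; dividing by the reported wall time lands in the same ballpark as the EvtsPerSec lines. A back-of-the-envelope sketch, under that reading of the arguments:

# Back-of-the-envelope throughput, assuming '-p nblocks nthreads niterations'.
nblocks, nthreads, niterations = 2048, 256, 12
total_events = nblocks * nthreads * niterations  # 6,291,456 events
total_seconds = 6.954919                         # TOTAL of the none_m_inl0_hrd1 run
print(f"{total_events / total_seconds:.3e} events/s")
# ~9.0e+05, consistent with EvtsPerSec[Rmb+ME] = 9.6e+05 above
# (TOTAL also covers setup outside the timed sampling+ME section)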
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.156116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.156116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.870054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.870054e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305744 sec -INFO: No Floating Point Exceptions have been reported - 12,687,194,497 cycles # 2.940 GHz - 31,570,654,619 instructions # 2.49 insn per cycle - 4.317357131 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.739558 sec + 13,621,642,189 cycles # 2.873 GHz + 35,197,065,818 instructions # 2.58 insn per cycle + 4.744878748 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.951503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.723168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.723168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047862e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.919113e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.919113e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.669508 sec -INFO: No Floating Point Exceptions have been reported - 10,337,023,986 cycles # 2.809 GHz - 19,600,398,756 instructions # 1.90 insn per cycle - 3.680210311 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 0) (512z: 0) +TOTAL : 3.489609 sec + 9,585,163,191 cycles # 2.743 GHz + 18,694,267,994 instructions # 1.95 insn per cycle + 3.495235656 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1903) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.000628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.813640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.813640e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.160459e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.127988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.127988e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.591164 sec -INFO: No Floating Point Exceptions have been reported - 10,093,463,938 cycles # 2.804 GHz - 19,298,137,282 instructions # 1.91 insn per cycle - 3.601580555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 191) (512z: 0) +TOTAL : 3.326756 sec + 9,154,692,273 cycles # 2.749 GHz + 18,313,995,483 instructions # 2.00 insn per cycle + 3.332018076 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1778) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
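The '=Symbols in CPPProcess_cpp.o=' lines tally the SIMD flavours of the instructions compiled into the matrix-element object file, which is how these logs corroborate that each backend really vectorised as requested (e.g. the 512y_m_inl0_hrd1 build above is dominated by avx2-class symbols). A crude approximation of such a tally, classifying disassembly lines by the widest SIMD register they touch (a simplified stand-in, not the script used to produce these logs):

import subprocess

def simd_register_counts(objfile):
    """Crude proxy: classify objdump lines by the widest SIMD register used."""
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    counts = {"zmm": 0, "ymm": 0, "xmm": 0}  # widest register checked first
    for line in asm.splitlines():
        for reg in counts:
            if "%" + reg in line:
                counts[reg] += 1
                break
    return counts

print(simd_register_counts("CPPProcess_cpp.o"))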
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.833398e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483164e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.531353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968559e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.878021 sec -INFO: No Floating Point Exceptions have been reported - 8,399,559,009 cycles # 2.161 GHz - 15,073,176,103 instructions # 1.79 insn per cycle - 3.888708235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 959) (512y: 155) (512z: 1296) +TOTAL : 4.525099 sec + 9,221,463,827 cycles # 2.036 GHz + 14,289,434,484 instructions # 1.55 insn per cycle + 4.530688944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1169) (512y: 40) (512z: 1209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
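Taken together, the hrd1 runs above show the usual SIMD ladder on this Xeon Silver 4216 host: roughly 1.7x (sse4), 2.6x (avx2) and 2.8x (512y) over the scalar build, with 512z falling back to about 1.8x at the lower measured AVX512 clock (~2.0 GHz vs ~2.9 GHz in the cycle counters). A small sketch computing those ratios from the logged EvtsPerSec[MatrixElems] values:

# Speedups over the scalar 'none' build, from the EvtsPerSec[MatrixElems]
# values logged above for the m_inl0_hrd1 configuration.
throughput = {
    "none": 1.111912e+06, "sse4": 1.870054e+06, "avx2": 2.919113e+06,
    "512y": 3.127988e+06, "512z": 1.968559e+06,
}
for backend, eps in throughput.items():
    print(f"{backend}: x{eps / throughput['none']:.2f}")
# none x1.00, sse4 x1.68, avx2 x2.63, 512y x2.81, 512z x1.77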
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..ef99d18fee --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:16:54 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +7.891541e+05 1 256 +1.602544e+06 2 256 +3.143940e+06 4 256 +6.117505e+06 8 256 +1.006415e+07 16 256 +1.015524e+07 32 256 +9.869606e+06 64 256 +1.025593e+07 128 256 +1.045621e+07 256 256 +1.042857e+07 512 256 +1.045995e+07 1024 256 +### GPU: scaling test 32 +1.055482e+05 1 32 +2.141363e+05 2 32 +4.117622e+05 4 32 +8.094196e+05 8 32 +1.660171e+06 16 32 +3.251857e+06 32 32 +6.412223e+06 64 32 +1.071040e+07 128 32 +1.040890e+07 256 32 +9.723823e+06 512 32 +1.003785e+07 1024 32 +1.020642e+07 2048 32 +1.030114e+07 4096 32 +1.031146e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.682639e+05 1 256 +1.744753e+05 2 256 +1.760606e+05 4 256 +### CPU: scaling test 32 +1.477098e+05 1 32 +1.544446e+05 2 32 +1.672586e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.878371e+05 1 256 +2.981327e+05 2 256 +2.993239e+05 4 256 +### CPU: scaling test 32 +2.562542e+05 1 32 +2.522585e+05 2 32 +2.636593e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.122900e+05 1 256 +5.152419e+05 2 256 +5.057715e+05 4 256 +### CPU: scaling test 32 +4.742919e+05 1 32 +5.186007e+05 2 32 +5.219122e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.908879e+05 1 256 +4.835691e+05 2 256 +5.081233e+05 4 256 +### CPU: scaling test 32 +5.020002e+05 1 32 +4.694044e+05 2 32 +5.132071e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.468816e+05 1 256 +3.430523e+05 2 256 +3.448120e+05 4 256 +### CPU: scaling test 32 +3.438642e+05 1 32 +3.306639e+05 2 32 +3.398922e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 0abecbd859..12554813ea 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
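The new .scaling files above list one 'throughput nblocks nthreads' triple per line, swept over launch configurations; on this V100S the double-precision gg_ttx run saturates near 1.0e+07 events/s once about 16 blocks of 256 threads (or 128 blocks of 32) are in flight, while the CPU backends are flat from the first row. A sketch of reading such a table and locating the saturation point, assuming that three-column format:

# Parse a .scaling table and report the smallest launch configuration
# that reaches 95% of peak throughput (file name from the diff above).
rows = []
with open("log_ggtt_mad_d_inl0_hrd0.scaling") as f:
    for line in f:
        parts = line.split()
        if len(parts) == 3 and parts[0][0].isdigit():  # 'eps nblocks nthreads'
            rows.append((float(parts[0]), int(parts[1]), int(parts[2])))
peak = max(eps for eps, _, _ in rows)
for eps, nblocks, nthreads in rows:
    if eps >= 0.95 * peak:
        print(f"saturates near {nblocks} x {nthreads} at {eps:.3e} events/s")
        break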
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:57:50 +DATE: 2025-09-24_07:44:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.424562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.378226e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000814e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.404580e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056492e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532719 sec -INFO: No Floating Point Exceptions have been reported - 2,198,564,055 cycles # 2.860 GHz - 3,137,529,593 instructions # 1.43 insn per cycle - 0.850854779 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.628724 sec + 2,721,117,292 cycles # 2.810 GHz + 4,339,502,991 instructions # 1.59 insn per cycle + 1.033254992 seconds time elapsed +......................................................................... 
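The perf-style counters in these logs are internally consistent: instructions divided by cycles reproduces the printed IPC, and cycles divided by the quoted clock gives the CPU-busy time, slightly below the elapsed time for GPU runs since the host sits partly idle. A quick consistency check on the cuda_d_inl0_hrd0 numbers above:

# Consistency check on the perf counters of the cuda_d_inl0_hrd0 run above.
cycles = 2_721_117_292
instructions = 4_339_502_991
clock_ghz = 2.810
elapsed_s = 1.033254992
print(f"IPC      = {instructions / cycles:.2f}")         # 1.59, as printed
print(f"CPU busy = {cycles / (clock_ghz * 1e9):.3f} s")  # ~0.968 s < 1.033 s elapsed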
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.821542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.869016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.729417e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.772089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.772089e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.902388 sec -INFO: No Floating Point Exceptions have been reported - 17,373,663,633 cycles # 2.939 GHz - 46,051,346,456 instructions # 2.65 insn per cycle - 5.916149203 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.171630 sec + 17,456,416,634 cycles # 2.826 GHz + 46,971,345,188 instructions # 2.69 insn per cycle + 6.177518078 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.199984e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091735e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.425369 sec -INFO: No Floating Point Exceptions have been reported - 10,116,123,100 cycles # 2.945 GHz - 27,968,506,728 instructions # 2.76 insn per cycle - 3.436971917 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.659325 sec + 10,530,690,716 cycles # 2.874 GHz + 29,287,857,198 instructions # 2.78 insn per cycle + 3.665244431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.021241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.768889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.129034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.129034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.236686 sec -INFO: No Floating Point Exceptions have been reported - 6,226,726,050 cycles # 2.773 GHz - 12,700,169,832 instructions # 2.04 insn per cycle - 2.249020906 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.307581 sec + 6,215,258,548 cycles # 2.688 GHz + 12,525,382,616 instructions # 2.02 insn per cycle + 2.313431050 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.518459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996461e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.044890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.444572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.444572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.046768 sec -INFO: No Floating Point Exceptions have been reported - 5,709,909,658 cycles # 2.777 GHz - 12,140,194,379 instructions # 2.13 insn per cycle - 2.059786524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.187457 sec + 5,882,745,324 cycles # 2.683 GHz + 12,185,179,748 instructions # 2.07 insn per cycle + 2.193247103 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.403513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.583329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.583329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469286e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469286e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.229693 sec -INFO: No Floating Point Exceptions have been reported - 6,051,702,488 cycles # 1.869 GHz - 8,428,750,265 instructions # 1.39 insn per cycle - 3.242969033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.279663 sec + 5,830,954,541 cycles # 1.775 GHz + 7,895,679,026 instructions # 1.35 insn per cycle + 3.285445110 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..c8cf787c12 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:36:22 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.202700e+05 1 256 +4.144491e+05 2 256 +8.179658e+05 4 256 +1.562332e+06 8 256 +2.892629e+06 16 256 +4.298762e+06 32 256 +4.894852e+06 64 256 +5.444492e+06 128 256 +5.683171e+06 256 256 +5.820623e+06 512 256 +5.919521e+06 1024 256 +### GPU: scaling test 32 +2.776014e+04 1 32 +5.644799e+04 2 32 +1.100256e+05 4 32 +2.252125e+05 8 32 +4.345424e+05 16 32 +8.321232e+05 32 32 +1.586046e+06 64 32 +2.908073e+06 128 32 +4.301308e+06 256 32 +4.913299e+06 512 32 +5.390513e+06 1024 32 +5.633664e+06 2048 32 +5.787401e+06 4096 32 +5.878205e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.696941e+05 1 256 +1.717741e+05 2 256 +1.790509e+05 4 256 +### CPU: scaling test 32 +1.689002e+05 1 32 +1.571127e+05 2 32 +1.541879e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.839038e+05 1 256 +2.913499e+05 2 256 +2.993763e+05 4 256 +### CPU: scaling test 32 +2.509115e+05 1 32 +2.754181e+05 2 32 +2.843895e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.154235e+05 1 256 +5.028185e+05 2 256 +4.765561e+05 4 256 +### CPU: scaling test 32 +4.730649e+05 1 32 +5.173097e+05 2 32 +4.703842e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.070823e+05 1 256 +5.471126e+05 2 256 +5.238208e+05 4 256 
+### CPU: scaling test 32 +5.417393e+05 1 32 +5.505282e+05 2 32 +5.522550e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.473801e+05 1 256 +3.466467e+05 2 256 +3.476546e+05 4 256 +### CPU: scaling test 32 +3.427702e+05 1 32 +3.255838e+05 2 32 +3.435098e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..5bcf17d672 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:31:05 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.699751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.094514e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.118806e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.755908 sec + 5,198,972,304 cycles # 2.815 GHz + 7,555,850,717 instructions # 1.45 insn per cycle + 2.202249998 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.758926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803118e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.070096 sec + 17,462,788,649 cycles # 2.875 GHz + 46,971,670,398 instructions # 2.69 insn per cycle + 6.075826013 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.950618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.663339 sec + 10,527,655,944 cycles # 2.870 GHz + 29,286,438,599 instructions # 2.78 insn per cycle + 3.668929376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.773688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.129278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.129278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.305290 sec + 6,201,815,154 cycles # 2.685 GHz + 12,525,642,665 instructions # 2.02 insn per cycle + 2.310979411 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.054242e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447760e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.182991 sec + 5,886,113,793 cycles # 2.691 GHz + 12,185,152,362 instructions # 2.07 insn per cycle + 2.188678888 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.302489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466219e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.284220 sec + 5,848,328,873 cycles # 1.779 GHz + 7,896,060,647 instructions # 1.35 insn per cycle + 3.290064278 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 0a62f31f21..9191103607 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:37:36 +DATE: 2025-09-24_08:55:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.523249e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008578e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008578e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.122019e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.976634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.976634e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.943118 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,438,006,415 cycles # 2.887 GHz - 4,812,518,572 instructions # 1.40 insn per cycle - 1.248014993 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.933427 sec + 3,692,889,749 cycles # 2.838 GHz + 6,132,016,544 instructions # 1.66 insn per cycle + 1.359401129 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.806787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852935e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.747683e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.792018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.792018e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.028463 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 17,701,660,691 cycles # 2.931 GHz - 46,100,592,443 instructions # 2.60 insn per cycle - 6.041454793 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.189057 sec + 17,795,188,257 cycles # 2.873 GHz + 47,029,255,433 instructions # 2.64 insn per cycle + 6.196117113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.171570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.921364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.058184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.058184e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.537488 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,436,410,766 cycles # 2.940 GHz - 28,150,415,987 instructions # 2.70 insn per cycle - 3.550700440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.779219 sec + 10,873,140,033 cycles # 2.873 GHz + 29,459,756,408 instructions # 2.71 insn per cycle + 3.786041909 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.940586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.316252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.316252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.690469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034578e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.355700 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,586,554,223 cycles # 2.781 GHz - 12,999,619,553 instructions # 1.97 insn per cycle - 2.369192751 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.428734 sec + 6,544,737,755 cycles # 2.688 GHz + 12,802,283,348 instructions # 1.96 insn per cycle + 2.435745600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.425137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.877080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.877080e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.968126e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.352687e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.352687e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.160954 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,058,497,746 cycles # 2.788 GHz - 12,422,408,910 instructions # 2.05 insn per cycle - 2.174009213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.302326 sec + 6,222,182,781 cycles # 2.696 GHz + 12,460,650,523 instructions # 2.00 insn per cycle + 2.309301530 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.454260e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.633384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.633384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.419460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.419460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.271770 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,220,081,356 cycles # 1.894 GHz - 8,655,636,644 instructions # 1.39 insn per cycle - 3.285127387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.410893 sec + 6,198,364,555 cycles # 1.814 GHz + 8,129,999,388 instructions # 1.31 insn per cycle + 3.417828859 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70d02af695..04fe28e8ae 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:49:47 +DATE: 2025-09-24_09:14:34 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.202403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.187841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.877468e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.237181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048925e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057217e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.637594 sec -INFO: No Floating Point Exceptions have been reported - 2,481,390,363 cycles # 2.852 GHz - 3,619,998,982 instructions # 1.46 insn per cycle - 0.928734017 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.728904 sec + 3,064,102,873 cycles # 2.835 GHz + 4,857,097,227 instructions # 1.59 insn per cycle + 1.137267130 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.808108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854363e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.758915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803139e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.981515 sec -INFO: No Floating Point Exceptions have been reported - 17,441,882,337 cycles # 2.914 GHz - 45,980,812,555 instructions # 2.64 insn per cycle - 5.987317462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.130886 sec + 17,629,078,696 cycles # 2.874 GHz + 46,986,708,443 instructions # 2.67 insn per cycle + 6.136336124 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.173867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.948603e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.087576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.087576e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.490197 sec -INFO: No Floating Point Exceptions have been reported - 10,215,611,800 cycles # 2.923 GHz - 27,889,324,001 instructions # 2.73 insn per cycle - 3.495993800 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.725904 sec + 10,708,180,208 cycles # 2.871 GHz + 29,285,168,892 instructions # 2.73 insn per cycle + 3.731175047 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
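The cmpExe steps cross-check the average matrix element computed by the C++/CUDA executable against the Fortran (F77) wrapper running on the same events, and accept if the relative difference stays within 5E-3. Reproducing the printed value from the 'none' numbers above shows the difference is computed as |a-b|/|a| (a small self-contained check; the tolerance is the one quoted in the log):

#include <cmath>
#include <cstdio>

int main()
{
  const double aveCpp = 2.028807e+00;       // Avg ME (C++/C++) above
  const double aveF77 = 2.0288063388515649; // Avg ME (F77/C++) above
  const double relDiff = std::fabs( aveCpp - aveF77 ) / std::fabs( aveCpp );
  std::printf( "Relative difference = %.16e\n", relDiff ); // ~3.2588e-07
  std::printf( "%s (relative difference %s 5E-3)\n",
               relDiff <= 5e-3 ? "OK" : "ERROR", relDiff <= 5e-3 ? "<=" : ">" );
  return 0;
}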
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.389873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.389873e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.743537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.097973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.097973e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.281339 sec -INFO: No Floating Point Exceptions have been reported - 6,287,168,374 cycles # 2.750 GHz - 12,602,929,813 instructions # 2.00 insn per cycle - 2.287435325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.381248 sec + 6,381,646,347 cycles # 2.676 GHz + 12,508,043,189 instructions # 1.96 insn per cycle + 2.386552428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.471434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.936245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.936245e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.071295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.472287e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.472287e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.097327 sec -INFO: No Floating Point Exceptions have been reported - 5,814,420,150 cycles # 2.765 GHz - 11,994,829,914 instructions # 2.06 insn per cycle - 2.103345298 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.238244 sec + 6,056,118,225 cycles # 2.701 GHz + 12,131,985,515 instructions # 2.00 insn per cycle + 2.243511255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.462865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.313116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.218108 sec -INFO: No Floating Point Exceptions have been reported - 5,937,437,503 cycles # 1.843 GHz - 8,290,568,638 instructions # 1.40 insn per cycle - 3.224462086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.336612 sec + 6,011,176,733 cycles # 1.799 GHz + 7,844,203,228 instructions # 1.30 insn per cycle + 3.341856637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 794a3c9310..f10f1223e6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:46:56 +DATE: 2025-09-24_09:10:28 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.311257e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.342288e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004457e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.236174e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049477e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057927e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.566605 sec -INFO: No Floating Point Exceptions have been reported - 2,313,605,054 cycles # 2.893 GHz - 3,600,350,267 instructions # 1.56 insn per cycle - 0.856648834 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.664857 sec + 2,876,847,918 cycles # 2.837 GHz + 4,812,114,039 instructions # 1.67 insn per cycle + 1.074444340 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.756610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.801195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.801195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855309 sec -INFO: No Floating Point Exceptions have been reported - 17,230,682,954 cycles # 2.940 GHz - 45,932,528,772 instructions # 2.67 insn per cycle - 5.861424268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.077308 sec + 17,463,617,999 cycles # 2.872 GHz + 46,970,827,220 instructions # 2.69 insn per cycle + 6.082669851 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378302e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.951168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089979e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.372268 sec -INFO: No Floating Point Exceptions have been reported - 9,959,367,668 cycles # 2.949 GHz - 27,848,270,798 instructions # 2.80 insn per cycle - 3.378265573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.661548 sec + 10,532,998,621 cycles # 2.873 GHz + 29,286,182,285 instructions # 2.78 insn per cycle + 3.667011568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
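The '=Symbols in CPPProcess_cpp.o=' lines count SIMD instructions in the disassembled object file, confirming that each build really uses its intended ISA: only 128-bit ops in the sse4 build above, zmm ops only in the 512z build below. The repository derives these counts with a helper script; the following standalone sketch is an illustrative equivalent only, and coarser than the log's sse4/avx2/512y/512z split:

#include <cstdio>
#include <cstring>
#include <string>

int main( int argc, char** argv )
{
  // Disassemble the object file and bucket instructions by register width.
  const std::string cmd = std::string( "objdump -d " ) + ( argc > 1 ? argv[1] : "CPPProcess_cpp.o" );
  FILE* pipe = popen( cmd.c_str(), "r" ); // POSIX popen
  if( !pipe ) { std::perror( "popen" ); return 1; }
  long nxmm = 0, nymm = 0, nzmm = 0;
  char line[512];
  while( std::fgets( line, sizeof( line ), pipe ) )
  {
    if( std::strstr( line, "%zmm" ) ) nzmm++;      // 512-bit AVX512 ops (~'512z')
    else if( std::strstr( line, "%ymm" ) ) nymm++; // 256-bit ops ('avx2'+'512y' merged)
    else if( std::strstr( line, "%xmm" ) ) nxmm++; // 128-bit ops (~'sse4')
  }
  pclose( pipe );
  std::printf( "(~sse4: %ld) (avx2+512y: %ld) (512z: %ld)\n", nxmm, nymm, nzmm );
  return 0;
}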
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.391220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.391220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.769638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.128119e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.128119e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.206484 sec -INFO: No Floating Point Exceptions have been reported - 6,113,930,208 cycles # 2.765 GHz - 12,581,849,902 instructions # 2.06 insn per cycle - 2.212402360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.306780 sec + 6,205,424,366 cycles # 2.685 GHz + 12,524,741,443 instructions # 2.02 insn per cycle + 2.312117670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.516180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.984165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.984165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.071431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.470140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.470140e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007710 sec -INFO: No Floating Point Exceptions have been reported - 5,576,628,773 cycles # 2.771 GHz - 12,020,299,868 instructions # 2.16 insn per cycle - 2.013581558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.175899 sec + 5,871,123,047 cycles # 2.693 GHz + 12,184,260,539 instructions # 2.08 insn per cycle + 2.181215362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.314814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.102138 sec -INFO: No Floating Point Exceptions have been reported - 5,751,986,200 cycles # 1.852 GHz - 8,297,969,466 instructions # 1.44 insn per cycle - 3.107697215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.271642 sec + 5,832,726,143 cycles # 1.781 GHz + 7,894,703,614 instructions # 1.35 insn per cycle + 3.276923395 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..886c892c14 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_09:20:18 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.244710e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048259e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056627e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.636988 sec + 2,686,906,350 cycles # 2.813 GHz + 4,300,779,901 instructions # 1.60 insn per cycle + 1.016142360 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
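Compared with the 2024 logs, the ncu profiles above no longer show a single monolithic 'sigmaKin' kernel at 214 registers per thread, but three smaller launches: 'diagram1' (97 registers), 'diagram2' (70) and 'color_sum_kernel' (28). The sketch below is a schematic, hypothetical illustration of that split-kernel pattern — per-diagram kernels accumulating color-ordered amplitudes, then a separate color sum; the names, signatures and toy 2x2 color matrix are invented, not the plugin's generated code:

#include <cstdio>
#include <cuda_runtime.h>

// Toy per-diagram kernels: each accumulates its contribution to two
// color-ordered amplitudes ('jamps') per event.
__global__ void diagram1( const double* momenta, double* jamps, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamps[2 * ievt + 0] += momenta[ievt]; // placeholder amplitude
}

__global__ void diagram2( const double* momenta, double* jamps, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamps[2 * ievt + 1] += 0.5 * momenta[ievt]; // placeholder
}

// Toy color sum: |ME|^2 = jamp^T * colorMatrix * jamp per event.
__global__ void color_sum_kernel( const double* jamps, double* me2, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt >= nevt ) return;
  const double cf[2][2] = { { 2, -1 }, { -1, 2 } }; // invented 2x2 color matrix
  double sum = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      sum += jamps[2 * ievt + i] * cf[i][j] * jamps[2 * ievt + j];
  me2[ievt] = sum;
}

int main()
{
  const int nevt = 512;
  double *dMom, *dJamps, *dMe2;
  cudaMalloc( &dMom, nevt * sizeof( double ) );
  cudaMalloc( &dJamps, 2 * nevt * sizeof( double ) );
  cudaMalloc( &dMe2, nevt * sizeof( double ) );
  cudaMemset( dMom, 0, nevt * sizeof( double ) );
  cudaMemset( dJamps, 0, 2 * nevt * sizeof( double ) );
  diagram1<<<2, 256>>>( dMom, dJamps, nevt );         // one launch per diagram...
  diagram2<<<2, 256>>>( dMom, dJamps, nevt );
  color_sum_kernel<<<2, 256>>>( dJamps, dMe2, nevt ); // ...then one color sum
  cudaDeviceSynchronize();
  std::printf( "done: %s\n", cudaGetErrorString( cudaGetLastError() ) );
  return 0;
}

Each launch then needs far fewer registers than the old monolithic kernel, at the cost of staging the intermediate amplitudes through global memory between launches.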
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.759819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803941e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.066107 sec + 17,450,230,221 cycles # 2.875 GHz + 46,970,996,393 instructions # 2.69 insn per cycle + 6.071702693 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
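The log headers also echo two runtime knobs, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR (both unset in these runs), alongside the build-time HASBLAS=hasBlas/hasNoBlas tag, suggesting that the color sum can alternatively be routed through (cu)BLAS at run time. Purely as an illustration of such an environment-variable switch — the plugin's actual decision logic is an assumption here, not shown:

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical reading of the CUDACPP_RUNTIME_BLASCOLORSUM knob: unset or
// empty (as in these logs) keeps the dedicated color-sum kernel.
static bool useBlasColorSum()
{
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && env[0] != '\0' && std::strcmp( env, "0" ) != 0;
}

int main()
{
  std::printf( "color sum via %s\n", useBlasColorSum() ? "(cu)BLAS" : "dedicated kernel" );
  return 0;
}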
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.945444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084219e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.668722 sec + 10,531,909,043 cycles # 2.867 GHz + 29,285,880,648 instructions # 2.78 insn per cycle + 3.673985883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.776361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.141048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.141048e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.303397 sec + 6,215,739,916 cycles # 2.693 GHz + 12,525,182,351 instructions # 2.02 insn per cycle + 2.308825772 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.057506e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.454426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.454426e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.182222 sec + 5,875,940,555 cycles # 2.687 GHz + 12,184,942,026 instructions # 2.07 insn per cycle + 2.187641932 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.310786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.476862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.476862e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.275530 sec + 5,832,927,122 cycles # 1.779 GHz + 7,895,012,180 instructions # 1.35 insn per cycle + 3.281078765 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 70a45db399..0f1925f64d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:10 +DATE: 2025-09-24_09:06:37 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.785807e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.291280e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.973584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.776809e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057384e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.717756 sec -INFO: No Floating Point Exceptions have been reported - 2,755,914,027 cycles # 2.900 GHz - 4,368,405,962 instructions # 1.59 insn per cycle - 1.007006361 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.825846 sec + 3,321,633,294 cycles # 2.833 GHz + 5,533,137,328 instructions # 1.67 insn per cycle + 1.233631459 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
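The runNcu lines above no longer profile a single monolithic sigmaKin kernel: the new profile shows one kernel per Feynman diagram (diagram1, diagram2) plus a separate color_sum_kernel, each needing far fewer registers per thread (97/70/28 versus 214 for the old sigmaKin). The following is only a schematic of that split with hypothetical signatures, not the generated CPPProcess.cu code.

#include <cuda_runtime.h>

using fptype = double;

__global__ void diagram1( const fptype* momenta, fptype* jamps ) // hypothetical signature
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = momenta[ievt]; // placeholder: diagram-1 amplitude for event ievt
}

__global__ void diagram2( const fptype* momenta, fptype* jamps )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] += momenta[ievt]; // placeholder: accumulate diagram-2 amplitude
}

__global__ void color_sum_kernel( const fptype* jamps, fptype* me2 )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  me2[ievt] = jamps[ievt] * jamps[ievt]; // placeholder: colour-summed |M|^2
}

int main()
{
  const int nblk = 2048, nthr = 256; // launch shape of "check_cuda.exe -p 2048 256 1" above
  const size_t bytes = size_t( nblk ) * nthr * sizeof( fptype );
  fptype *momenta, *jamps, *me2;
  cudaMalloc( &momenta, bytes );
  cudaMalloc( &jamps, bytes );
  cudaMalloc( &me2, bytes );
  diagram1<<<nblk, nthr>>>( momenta, jamps );
  diagram2<<<nblk, nthr>>>( momenta, jamps );
  color_sum_kernel<<<nblk, nthr>>>( jamps, me2 );
  cudaDeviceSynchronize();
  cudaFree( momenta ); cudaFree( jamps ); cudaFree( me2 );
  return 0;
}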
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.829948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877608e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803410e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.839744 sec -INFO: No Floating Point Exceptions have been reported - 17,231,514,699 cycles # 2.948 GHz - 45,931,758,909 instructions # 2.67 insn per cycle - 5.845651027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.069385 sec + 17,452,545,633 cycles # 2.874 GHz + 46,970,678,458 instructions # 2.69 insn per cycle + 6.074915326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.376174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.952210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.090467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.090467e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.370523 sec -INFO: No Floating Point Exceptions have been reported - 9,939,666,586 cycles # 2.945 GHz - 27,847,302,489 instructions # 2.80 insn per cycle - 3.376515027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.660961 sec + 10,528,744,959 cycles # 2.872 GHz + 29,285,969,565 instructions # 2.78 insn per cycle + 3.666466666 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
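The throughput figures follow from the launch configuration: assuming "-p 2048 256 2" requests 2048 blocks of 256 threads for 2 iterations (the usual check_*.exe convention), each run processes 1048576 events, and dividing by the measured TOTAL time lands just below the per-phase EvtsPerSec counters, which exclude setup. A back-of-the-envelope check against the 'none' (no SIMD) run above:

#include <cstdio>

int main()
{
  const long long nEvents = 2048LL * 256 * 2; // assumed meaning of "-p 2048 256 2": 1048576 events
  const double totalSeconds = 6.069385;       // TOTAL of the 'none' run above
  // ~1.73e+05 events/sec, consistent with (and slightly below) the logged
  // EvtsPerSec[Rmb+ME] = 1.759080e+05, since TOTAL also includes setup overhead.
  std::printf( "events = %lld, events/TOTAL = %.6e /sec\n", nEvents, nEvents / totalSeconds );
  return 0;
}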
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.058902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.451650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.451650e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.744356e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.100853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.100853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.181386 sec -INFO: No Floating Point Exceptions have been reported - 6,074,037,919 cycles # 2.778 GHz - 12,580,567,087 instructions # 2.07 insn per cycle - 2.187203017 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.319004 sec + 6,208,558,879 cycles # 2.672 GHz + 12,525,519,644 instructions # 2.02 insn per cycle + 2.324628512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
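The "Internal loops fptype_sv" lines in these runs encode the SIMD width as lanes of the floating-point type: in double precision, 128-bit SSE4.2 gives VECTOR[2], 256-bit AVX2/512y gives VECTOR[4], and 512-bit AVX512 (512z) gives VECTOR[8], i.e. the register width divided by the 64-bit double:

#include <cstdio>

int main()
{
  const int widths[] = { 128, 256, 512 }; // sse4, avx2/512y, 512z register widths in bits
  for( int bits : widths )
    std::printf( "%3d-bit registers -> VECTOR[%d] for double\n",
                 bits, bits / ( 8 * (int)sizeof( double ) ) ); // 2, 4, 8 lanes
  return 0;
}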
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.484469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.050195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.444774e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.444774e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.020942 sec -INFO: No Floating Point Exceptions have been reported - 5,589,694,694 cycles # 2.759 GHz - 12,020,772,424 instructions # 2.15 insn per cycle - 2.026934215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.184551 sec + 5,886,818,346 cycles # 2.689 GHz + 12,182,424,074 instructions # 2.07 insn per cycle + 2.189855469 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.541083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.728456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.728456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.470997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470997e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.072814 sec -INFO: No Floating Point Exceptions have been reported - 5,724,538,871 cycles # 1.860 GHz - 8,297,304,281 instructions # 1.45 insn per cycle - 3.079169559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.278572 sec + 5,842,492,465 cycles # 1.780 GHz + 7,895,231,616 instructions # 1.35 insn per cycle + 3.283921228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
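The perf counter lines translate directly into the derived figures printed after '#': instructions divided by cycles gives "insn per cycle", and cycles over CPU time gives the GHz annotation (perf normalises to task-clock, so dividing by the elapsed wall time only approximates it). Checking the 512z run above:

#include <cstdio>

int main()
{
  const double cycles = 5842492465.0;  // 512z d_inl0_hrd0 run above
  const double instructions = 7895231616.0;
  const double elapsed = 3.283921228;  // wall time; perf itself divides by task-clock
  std::printf( "insn per cycle ~ %.2f\n", instructions / cycles );   // ~1.35, as logged
  std::printf( "approx GHz     ~ %.3f\n", cycles / elapsed / 1e9 );  // ~1.78 (log: 1.780 GHz)
  return 0;
}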
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 03be4a726d..acb6158b6b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:15 +DATE: 2025-09-24_07:45:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508928e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.321752e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.401233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056279e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.536365 sec -INFO: No Floating Point Exceptions have been reported - 2,214,194,265 cycles # 2.876 GHz - 3,152,115,430 instructions # 1.42 insn per cycle - 0.834564895 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624817 sec + 2,770,804,330 cycles # 2.829 GHz + 4,445,203,695 instructions # 1.60 insn per cycle + 1.036109264 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.855453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.817113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.864515e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864515e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.800560 sec -INFO: No Floating Point Exceptions have been reported - 16,903,949,090 cycles # 2.909 GHz - 45,043,853,273 instructions # 2.66 insn per cycle - 5.813534817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.879296 sec + 16,915,750,663 cycles # 2.875 GHz + 45,827,908,987 instructions # 2.71 insn per cycle + 5.885040141 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 633) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.964734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.104969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.104969e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288467 sec -INFO: No Floating Point Exceptions have been reported - 9,645,043,566 cycles # 2.925 GHz - 26,807,862,552 instructions # 2.78 insn per cycle - 3.301069690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2327) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.645473 sec + 10,495,372,754 cycles # 2.875 GHz + 29,278,170,489 instructions # 2.79 insn per cycle + 3.651373133 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2913) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.763804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118520e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431911 sec -INFO: No Floating Point Exceptions have been reported - 6,762,097,168 cycles # 2.769 GHz - 14,239,182,198 instructions # 2.11 insn per cycle - 2.443454156 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +TOTAL : 2.309471 sec + 6,198,243,265 cycles # 2.678 GHz + 12,520,312,586 instructions # 2.02 insn per cycle + 2.315314098 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.784038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.036657e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429513e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.339078 sec -INFO: No Floating Point Exceptions have been reported - 6,493,835,738 cycles # 2.765 GHz - 13,835,177,964 instructions # 2.13 insn per cycle - 2.350490634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 298) (512z: 0) +TOTAL : 2.189307 sec + 5,886,882,915 cycles # 2.683 GHz + 12,178,604,730 instructions # 2.07 insn per cycle + 2.195089043 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2572) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.400894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.276480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.442165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.442165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.231977 sec -INFO: No Floating Point Exceptions have been reported - 6,054,126,925 cycles # 1.868 GHz - 10,181,313,288 instructions # 1.68 insn per cycle - 3.245420113 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1273) (512y: 208) (512z: 1988) +TOTAL : 3.310853 sec + 5,849,370,568 cycles # 1.766 GHz + 7,892,630,392 instructions # 1.35 insn per cycle + 3.316630553 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index f94c1448dd..892c6e6f77 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:32 +DATE: 2025-09-24_08:45:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.445619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.389644e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.998797e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.203899e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049403e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057753e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532166 sec -INFO: No Floating Point Exceptions have been reported - 2,223,705,741 cycles # 2.888 GHz - 3,137,862,648 instructions # 1.41 insn per cycle - 0.826622030 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.628394 sec + 2,751,584,432 cycles # 2.831 GHz + 4,342,569,960 instructions # 1.58 insn per cycle + 1.032472366 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.243473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.316891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.316891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.936373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936373e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.822701 sec -INFO: No Floating Point Exceptions have been reported - 14,262,425,677 cycles # 2.951 GHz - 34,462,229,045 instructions # 2.42 insn per cycle - 4.834685593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.672505 sec + 16,281,774,044 cycles # 2.868 GHz + 42,306,769,598 instructions # 2.60 insn per cycle + 5.678252169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 714) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.991823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.119000e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273875e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.653458 sec -INFO: No Floating Point Exceptions have been reported - 10,828,452,798 cycles # 2.955 GHz - 24,364,594,695 instructions # 2.25 insn per cycle - 3.665357624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.470530 sec + 9,977,147,210 cycles # 2.871 GHz + 25,847,622,241 instructions # 2.59 insn per cycle + 3.476158258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2979) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.125204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.540174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.540174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.432860 sec -INFO: No Floating Point Exceptions have been reported - 6,763,126,248 cycles # 2.768 GHz - 12,520,790,366 instructions # 1.85 insn per cycle - 2.444836798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) +TOTAL : 2.154277 sec + 5,808,180,817 cycles # 2.691 GHz + 10,998,718,090 instructions # 1.89 insn per cycle + 2.160064381 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2666) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.983949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.371900e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.371900e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.604062e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.095977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.095977e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251146 sec -INFO: No Floating Point Exceptions have been reported - 6,291,656,449 cycles # 2.782 GHz - 11,662,894,163 instructions # 1.85 insn per cycle - 2.263135736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2644) (512y: 239) (512z: 0) +TOTAL : 1.978924 sec + 5,298,127,046 cycles # 2.671 GHz + 10,086,761,200 instructions # 1.90 insn per cycle + 1.984504704 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.728872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.941749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.941749e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.771603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.771603e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.960781 sec -INFO: No Floating Point Exceptions have been reported - 5,563,913,804 cycles # 1.872 GHz - 9,412,295,126 instructions # 1.69 insn per cycle - 2.972906161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2099) (512y: 282) (512z: 1958) +TOTAL : 3.039803 sec + 5,439,957,246 cycles # 1.787 GHz + 6,988,231,171 instructions # 1.28 insn per cycle + 3.045430879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 57) (512z: 1812) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 3c1647789f..9240e55d7f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:57 +DATE: 2025-09-24_08:46:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.391002e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.323919e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.976474e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.185938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045496e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053733e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534058 sec -INFO: No Floating Point Exceptions have been reported - 2,225,875,951 cycles # 2.883 GHz - 3,143,824,990 instructions # 1.41 insn per cycle - 0.828954123 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.627663 sec + 2,749,687,654 cycles # 2.830 GHz + 4,365,506,275 instructions # 1.59 insn per cycle + 1.031393421 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.586147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.682611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935469e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.200138 sec -INFO: No Floating Point Exceptions have been reported - 12,457,576,414 cycles # 2.958 GHz - 35,030,140,380 instructions # 2.81 insn per cycle - 4.211834896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 430) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.673558 sec + 16,293,550,728 cycles # 2.870 GHz + 41,861,740,674 instructions # 2.57 insn per cycle + 5.679192911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.003695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145378e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151299e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.310989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310989e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.637171 sec -INFO: No Floating Point Exceptions have been reported - 10,771,658,335 cycles # 2.953 GHz - 23,459,809,146 instructions # 2.18 insn per cycle - 3.648522280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2378) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.435956 sec + 9,881,633,737 cycles # 2.872 GHz + 25,645,447,655 instructions # 2.60 insn per cycle + 3.441581401 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.029039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.423785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.423785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.173928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.603025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.603025e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.231082 sec -INFO: No Floating Point Exceptions have been reported - 6,224,358,348 cycles # 2.777 GHz - 11,980,138,777 instructions # 1.92 insn per cycle - 2.242426635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2468) (512y: 0) (512z: 0) +TOTAL : 2.135077 sec + 5,748,087,049 cycles # 2.686 GHz + 10,787,985,756 instructions # 1.88 insn per cycle + 2.140683671 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2293) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.044695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.439218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.439218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.811256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.348034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.348034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224952 sec -INFO: No Floating Point Exceptions have been reported - 6,216,689,838 cycles # 2.781 GHz - 11,219,235,507 instructions # 1.80 insn per cycle - 2.236216110 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2098) (512y: 174) (512z: 0) +TOTAL : 1.912055 sec + 5,134,706,346 cycles # 2.679 GHz + 9,902,833,377 instructions # 1.93 insn per cycle + 1.917709298 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2034) (512y: 40) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.888626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118349e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.596860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.795893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.842958 sec -INFO: No Floating Point Exceptions have been reported - 5,376,391,405 cycles # 1.885 GHz - 9,136,626,879 instructions # 1.70 insn per cycle - 2.854254782 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1632) (512y: 208) (512z: 1567) +TOTAL : 3.024884 sec + 5,382,065,012 cycles # 1.777 GHz + 6,925,116,980 instructions # 1.29 insn per cycle + 3.030667974 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 993) (512y: 67) (512z: 1602) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..943c404205 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:17:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +8.141613e+05 1 256 +1.655634e+06 2 256 +3.173708e+06 4 256 +6.232407e+06 8 256 +1.307682e+07 16 256 +1.882855e+07 32 256 +1.945238e+07 64 256 +2.009646e+07 128 256 +2.124544e+07 256 256 +2.154722e+07 512 256 +2.149132e+07 1024 256 +### GPU: scaling test 32 +1.045882e+05 1 32 +2.121559e+05 2 32 +4.291687e+05 4 32 +8.027620e+05 8 32 +1.602399e+06 16 32 +3.186378e+06 32 32 +6.087104e+06 64 32 +1.334324e+07 128 32 +1.760445e+07 256 32 +1.935350e+07 512 32 +1.987098e+07 1024 32 +2.043714e+07 2048 32 +2.068730e+07 4096 32 +2.072528e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.705683e+05 1 256 +1.855453e+05 2 256 +1.816065e+05 4 256 +### CPU: scaling test 32 +1.630083e+05 1 32 +1.607148e+05 2 32 +1.671437e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.067469e+05 1 256 +3.890007e+05 2 256 +4.167840e+05 4 256 +### CPU: scaling test 32 +3.472185e+05 1 32 +3.961475e+05 2 32 +3.940680e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.895313e+05 1 256 +9.759556e+05 2 256 +9.116339e+05 4 256 +### CPU: scaling test 32 +8.794592e+05 1 32 +8.459902e+05 2 32 +8.278360e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.033965e+05 1 256 +1.027122e+06 2 256 +9.561667e+05 4 256 +### CPU: scaling test 32 +9.164853e+05 1 32 +8.304139e+05 2 32 +8.973388e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +7.131913e+05 1 256 +7.291350e+05 2 256 +7.116716e+05 4 256 +### CPU: scaling test 32 +6.108036e+05 1 32 +5.039251e+05 2 32 +7.225230e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eed598e900..8748bfad0d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:31 +DATE: 2025-09-24_07:47:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.348925e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.730429e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847126e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848748e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.153247e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172236e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.489368 sec -INFO: No Floating Point Exceptions have been reported - 2,066,464,716 cycles # 2.888 GHz - 2,966,218,976 instructions # 1.44 insn per cycle - 0.775358949 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.538693 sec + 2,322,514,304 cycles # 2.829 GHz + 3,448,322,190 instructions # 1.48 insn per cycle + 0.878218682 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.976809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.829942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.564608 sec -INFO: No Floating Point Exceptions have been reported - 16,407,008,301 cycles # 2.946 GHz - 45,390,324,197 instructions # 2.77 insn per cycle - 5.572247633 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.816771 sec + 16,702,274,132 cycles # 2.869 GHz + 46,675,030,384 instructions # 2.79 insn per cycle + 5.822310848 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.527362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.867119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.095384e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.377755e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.418751 sec -INFO: No Floating Point Exceptions have been reported - 7,148,582,676 cycles # 2.947 GHz - 17,841,430,692 instructions # 2.50 insn per cycle - 2.426747092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.648167 sec + 7,618,074,541 cycles # 2.872 GHz + 18,461,358,812 instructions # 2.42 insn per cycle + 2.653697468 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.517580e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.517580e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.445048e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.627798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.627798e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.358362 sec -INFO: No Floating Point Exceptions have been reported - 3,812,563,399 cycles # 2.792 GHz - 8,312,155,726 instructions # 2.18 insn per cycle - 1.366469053 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.328918 sec + 3,566,458,202 cycles # 2.674 GHz + 7,631,838,956 instructions # 2.14 insn per cycle + 1.334330898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010674e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010674e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.867670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017863e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.295301 sec -INFO: No Floating Point Exceptions have been reported - 3,622,174,398 cycles # 2.781 GHz - 7,961,498,247 instructions # 2.20 insn per cycle - 1.303182368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.270218 sec + 3,422,297,214 cycles # 2.684 GHz + 7,470,055,154 instructions # 2.18 insn per cycle + 1.275737279 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.500324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161825e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.754154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481927e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481927e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.717843 sec -INFO: No Floating Point Exceptions have been reported - 3,332,199,340 cycles # 1.933 GHz - 6,146,454,565 instructions # 1.84 insn per cycle - 1.725889754 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.639571 sec + 3,053,224,009 cycles # 1.857 GHz + 5,249,590,881 instructions # 1.72 insn per cycle + 1.645301083 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..9386878840 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:37:44 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.582527e+05 1 256 +5.042869e+05 2 256 +1.010578e+06 4 256 +1.791732e+06 8 256 +3.138159e+06 16 256 +5.095367e+06 32 256 +7.254867e+06 64 256 +7.872948e+06 128 256 +8.393327e+06 256 256 +8.771099e+06 512 256 +8.880832e+06 1024 256 +### GPU: scaling test 32 +3.284527e+04 1 32 +6.671051e+04 2 32 +1.301834e+05 4 32 +2.576938e+05 8 32 +5.019726e+05 16 32 +9.673016e+05 32 32 +1.768771e+06 64 32 +3.070386e+06 128 32 +5.228071e+06 256 32 +7.087152e+06 512 32 +7.969015e+06 1024 32 +8.347439e+06 2048 32 +8.663041e+06 4096 32 +8.808998e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.766619e+05 1 256 +1.809926e+05 2 256 +1.845127e+05 4 256 +### CPU: scaling test 32 +1.630432e+05 1 32 +1.614893e+05 2 32 +1.640721e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.608205e+05 1 256 +3.947164e+05 2 256 +4.206070e+05 4 256 +### CPU: scaling test 32 +3.385312e+05 1 32 +3.602610e+05 2 32 +3.674835e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.161246e+05 1 256 +9.657298e+05 2 256 +9.014997e+05 4 256 +### CPU: scaling test 32 +8.745320e+05 1 32 +9.016498e+05 2 32 
+9.102870e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.628911e+05 1 256 +9.505774e+05 2 256 +9.477989e+05 4 256 +### CPU: scaling test 32 +7.395424e+05 1 32 +9.462837e+05 2 32 +9.578186e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.190607e+05 1 256 +7.431869e+05 2 256 +7.450388e+05 4 256 +### CPU: scaling test 32 +6.064167e+05 1 32 +6.801637e+05 2 32 +5.067120e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..c87245821e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:32:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.578502e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.049786e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.079458e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 1.303965 sec + 4,770,461,480 cycles # 2.840 GHz + 6,757,920,419 instructions # 1.42 insn per cycle + 1.740408679 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499532034621 +Relative difference = 1.920001590188648e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.833085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883191e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883191e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.806781 sec + 16,698,080,338 cycles # 2.874 GHz + 46,673,842,711 instructions # 2.80 insn per cycle + 5.812048935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.100807e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.380739e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.380739e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.644410 sec + 7,612,561,317 cycles # 2.874 GHz + 18,461,919,918 instructions # 2.43 insn per cycle + 2.649591246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.541523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.750734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.750734e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.313661 sec + 3,560,755,118 cycles # 2.702 GHz + 7,630,643,958 instructions # 2.14 insn per cycle + 1.318878863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.877946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.018670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.018670e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.268624 sec + 3,424,190,966 cycles # 2.689 GHz + 7,470,353,577 instructions # 2.18 insn per cycle + 1.273942336 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.776427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508703e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.634117 sec + 3,049,238,738 cycles # 1.861 GHz + 5,249,598,512 instructions # 1.72 insn per cycle + 1.639464560 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ba391daf9b..a5afff3307 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,255 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:38:02 +DATE: 2025-09-24_08:55:55 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962971e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.366502e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.366502e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.172268e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565316e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565316e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.683449 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,639,955,466 cycles # 2.881 GHz - 4,089,465,491 instructions # 1.55 insn per cycle - 0.973820402 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.742519 sec + 2,926,334,960 cycles # 2.832 GHz + 4,662,274,955 instructions # 1.59 insn per cycle + 1.090216738 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.825336e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875215e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875215e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.590872 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 16,505,827,538 cycles # 2.949 GHz - 45,383,324,587 instructions # 2.75 insn per cycle - 5.597525299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.875911 sec + 16,883,161,514 cycles # 2.871 GHz + 46,712,150,637 instructions # 2.77 insn per cycle + 5.882470914 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.503675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835801e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.835801e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.059731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.333877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.333877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463825 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,301,370,898 cycles # 2.956 GHz - 18,072,803,019 instructions # 2.48 insn per cycle - 2.471007950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.718254 sec + 7,814,141,803 cycles # 2.871 GHz + 18,738,072,843 instructions # 2.40 insn per cycle + 2.725028734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.228346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.356902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.356902e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.313202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.453842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.453842e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.409585 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,950,274,134 cycles # 2.790 GHz - 8,500,615,795 instructions # 2.15 insn per cycle - 1.416669722 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.400636 sec + 3,761,404,940 cycles # 2.678 GHz + 7,863,817,826 instructions # 2.09 insn per cycle + 1.409322955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.630316e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.908478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.908478e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.757434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003220e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.350838 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,776,468,219 cycles # 2.783 GHz - 8,150,432,975 instructions # 2.16 insn per cycle - 1.357973048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.333503 sec + 3,624,049,741 cycles # 2.707 GHz + 7,702,682,063 instructions # 2.13 insn per cycle + 1.340053856 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.446924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.088794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.088794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.657185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.366312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.366312e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.766906 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,483,580,907 cycles # 1.964 GHz - 6,352,443,418 instructions # 1.82 insn per cycle - 1.774118995 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.712600 sec + 3,264,565,864 cycles # 1.900 GHz + 5,499,475,316 instructions # 1.68 insn per cycle + 1.719139859 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index eaf1557b5a..f41ca6aec0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:50:12 +DATE: 2025-09-24_09:15:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.125576e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.707303e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828418e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.873971e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.152674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172021e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579716 sec -INFO: No Floating Point Exceptions have been reported - 2,336,853,883 cycles # 2.860 GHz - 3,355,823,518 instructions # 1.44 insn per cycle - 0.873538557 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.627107 sec + 2,578,417,587 cycles # 2.833 GHz + 3,892,656,259 instructions # 1.51 insn per cycle + 0.968128619 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.831869e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882089e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.578220 sec -INFO: No Floating Point Exceptions have been reported - 16,412,792,219 cycles # 2.940 GHz - 45,364,108,775 instructions # 2.76 insn per cycle - 5.583854256 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.868440 sec + 16,861,585,353 cycles # 2.871 GHz + 46,704,052,774 instructions # 2.77 insn per cycle + 5.873652139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.863028e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.863028e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.095423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.376391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.376391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.458830 sec -INFO: No Floating Point Exceptions have been reported - 7,256,357,914 cycles # 2.945 GHz - 17,803,442,746 instructions # 2.45 insn per cycle - 2.464565338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.705119 sec + 7,776,009,723 cycles # 2.870 GHz + 18,473,510,094 instructions # 2.38 insn per cycle + 2.710275194 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.321630e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.466483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.466483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.568159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.787921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.787921e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.402951 sec -INFO: No Floating Point Exceptions have been reported - 3,915,341,003 cycles # 2.781 GHz - 8,245,891,296 instructions # 2.11 insn per cycle - 1.408611815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.367811 sec + 3,715,855,776 cycles # 2.708 GHz + 7,614,650,652 instructions # 2.05 insn per cycle + 1.373099751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.769699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005525e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005525e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.949337e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.028339e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028339e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.339268 sec -INFO: No Floating Point Exceptions have been reported - 3,730,447,512 cycles # 2.775 GHz - 7,861,984,465 instructions # 2.11 insn per cycle - 1.344998375 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.317122 sec + 3,582,670,369 cycles # 2.711 GHz + 7,419,662,797 instructions # 2.07 insn per cycle + 1.322406778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.517692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.188107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.188107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.502325e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502325e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.753383 sec -INFO: No Floating Point Exceptions have been reported - 3,445,483,739 cycles # 1.959 GHz - 6,046,658,237 instructions # 1.75 insn per cycle - 1.759146158 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.693814 sec + 3,215,725,762 cycles # 1.894 GHz + 5,199,626,463 instructions # 1.62 insn per cycle + 1.699070891 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 0132142a7f..0caf85479e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:47:21 +DATE: 2025-09-24_09:11:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.231900e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718618e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843172e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.873602e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.148657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166610e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520956 sec -INFO: No Floating Point Exceptions have been reported - 2,145,908,279 cycles # 2.880 GHz - 3,342,720,192 instructions # 1.56 insn per cycle - 0.802555619 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.571773 sec + 2,418,396,415 cycles # 2.829 GHz + 3,890,889,196 instructions # 1.61 insn per cycle + 0.911678328 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983661e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.831816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.881917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.881917e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.518695 sec -INFO: No Floating Point Exceptions have been reported - 16,237,309,072 cycles # 2.940 GHz - 45,332,194,999 instructions # 2.79 insn per cycle - 5.524338903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.810849 sec + 16,696,288,253 cycles # 2.871 GHz + 46,674,633,233 instructions # 2.80 insn per cycle + 5.816081482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.531812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.871745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.871745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.100794e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.400101 sec -INFO: No Floating Point Exceptions have been reported - 7,092,917,063 cycles # 2.949 GHz - 17,790,950,300 instructions # 2.51 insn per cycle - 2.405895056 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.644194 sec + 7,609,575,934 cycles # 2.873 GHz + 18,461,091,440 instructions # 2.43 insn per cycle + 2.649247827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.364764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.520513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.520513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.548599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.763296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.763296e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339869 sec -INFO: No Floating Point Exceptions have been reported - 3,746,789,760 cycles # 2.786 GHz - 8,261,610,745 instructions # 2.20 insn per cycle - 1.345882215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.313536 sec + 3,555,615,490 cycles # 2.698 GHz + 7,630,109,811 instructions # 2.15 insn per cycle + 1.318778185 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013746e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.881190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019080e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019080e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.275913 sec -INFO: No Floating Point Exceptions have been reported - 3,561,649,230 cycles # 2.781 GHz - 7,911,264,889 instructions # 2.22 insn per cycle - 1.281614236 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.267655 sec + 3,421,150,014 cycles # 2.690 GHz + 7,468,696,042 instructions # 2.18 insn per cycle + 1.272664142 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.490214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.139560e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.775565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508708e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508708e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.701983 sec -INFO: No Floating Point Exceptions have been reported - 3,270,370,699 cycles # 1.916 GHz - 6,096,029,839 instructions # 1.86 insn per cycle - 1.707817189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.635015 sec + 3,048,650,953 cycles # 1.860 GHz + 5,249,745,788 instructions # 1.72 insn per cycle + 1.640374800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt
new file mode 100644
index 0000000000..884be120f5
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt
@@ -0,0 +1,225 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasNoBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasNoBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+DATE: 2025-09-24_09:21:30
+
+HASBLAS=hasNoBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.887556e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.151817e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.170495e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.539147 sec
+ 2,253,385,532 cycles # 2.821 GHz
+ 3,406,414,544 instructions # 1.51 insn per cycle
+ 0.855586365 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 2.028811e+00
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.834733e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.885495e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.885495e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 5.801919 sec
+ 16,697,530,123 cycles # 2.876 GHz
+ 46,673,467,044 instructions # 2.80 insn per cycle
+ 5.807192464 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028820e+00
+Avg ME (F77/C++) = 2.0288198669441044
+Relative difference = 6.558289825352968e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.095899e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.376126e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.376126e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.647195 sec
+ 7,612,237,553 cycles # 2.871 GHz
+ 18,461,327,399 instructions # 2.43 insn per cycle
+ 2.652402692 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288193075684831
+Relative difference = 1.515997647531052e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.573333e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.792686e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.792686e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.309469 sec
+ 3,553,379,916 cycles # 2.705 GHz
+ 7,630,260,456 instructions # 2.15 insn per cycle
+ 1.314555585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288181869545951
+Relative difference = 9.214951531400725e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.934339e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.026361e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.026361e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.260503 sec
+ 3,420,943,817 cycles # 2.705 GHz
+ 7,469,256,523 instructions # 2.18 insn per cycle
+ 1.265599127 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288181869545951
+Relative difference = 9.214951531400725e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 6.782654e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.516737e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.516737e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.632915 sec
+ 3,044,957,630 cycles # 1.860 GHz
+ 5,248,782,492 instructions # 1.72 insn per cycle
+ 1.638128824 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 55c92f68ec..759d4f650e 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,238 +10,219 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2024-10-06_09:44:35
+DATE: 2025-09-24_09:07:10

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP=
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.418560e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.722658e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.839243e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.205380e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.154806e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.174134e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
-TOTAL : 0.630221 sec
-INFO: No Floating Point Exceptions have been reported
- 2,475,236,721 cycles # 2.897 GHz
- 3,823,734,565 instructions # 1.54 insn per cycle
- 0.911361538 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
+TOTAL : 0.682695 sec
+ 2,718,889,879 cycles # 2.833 GHz
+ 4,277,705,603 instructions # 1.57 insn per cycle
+ 1.019275785 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.933112e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.987540e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.987540e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.831310e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.881527e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.881527e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 5.510619 sec
-INFO: No Floating Point Exceptions have been reported
- 16,239,692,933 cycles # 2.945 GHz
- 45,332,021,728 instructions # 2.79 insn per cycle
- 5.516250908 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.813015 sec
+ 16,694,502,398 cycles # 2.870 GHz
+ 46,672,918,750 instructions # 2.80 insn per cycle
+ 5.818316700 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
Avg ME (F77/C++) = 2.0288198669441044
Relative difference = 6.558289825352968e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.528380e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.868469e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.868469e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.098841e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.378793e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.378793e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.402631 sec
-INFO: No Floating Point Exceptions have been reported
- 7,087,618,340 cycles # 2.944 GHz
- 17,790,727,043 instructions # 2.51 insn per cycle
- 2.408346877 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.645691 sec
+ 7,609,537,591 cycles # 2.871 GHz
+ 18,461,181,243 instructions # 2.43 insn per cycle
+ 2.651091423 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
Avg ME (F77/C++) = 2.0288193075684831
Relative difference = 1.515997647531052e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.367783e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.536121e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.536121e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.498934e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.696715e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.696715e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.339197 sec
-INFO: No Floating Point Exceptions have been reported
- 3,748,433,186 cycles # 2.789 GHz
- 8,262,218,774 instructions # 2.20 insn per cycle
- 1.344812605 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0)
+TOTAL : 1.319589 sec
+ 3,558,633,235 cycles # 2.688 GHz
+ 7,630,430,725 instructions # 2.14 insn per cycle
+ 1.324822819 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.816225e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.011910e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.011910e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.947671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027956e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027956e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.274973 sec
-INFO: No Floating Point Exceptions have been reported
- 3,561,414,995 cycles # 2.782 GHz
- 7,912,015,045 instructions # 2.22 insn per cycle
- 1.280637958 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0)
+TOTAL : 1.258901 sec
+ 3,419,186,035 cycles # 2.707 GHz
+ 7,468,947,821 instructions # 2.18 insn per cycle
+ 1.264161665 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.504790e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.157762e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.157762e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.771440e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.505025e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.505025e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.699252 sec
-INFO: No Floating Point Exceptions have been reported
- 3,270,672,138 cycles # 1.919 GHz
- 6,095,863,693 instructions # 1.86 insn per cycle
- 1.704973507 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155)
+TOTAL : 1.635920 sec
+ 3,047,206,646 cycles # 1.858 GHz
+ 5,249,432,639 instructions # 1.72 insn per cycle
+ 1.641242692 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 5e80ecf473..c9756b5eaf 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2024-10-06_08:59:52
+DATE: 2025-09-24_07:47:34

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.326131e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.746336e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.856838e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.861885e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.158531e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.176528e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.493394 sec
-INFO: No Floating Point Exceptions have been reported
- 2,062,281,894 cycles # 2.861 GHz
- 2,938,913,241 instructions # 1.43 insn per cycle
- 0.784913836 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 126
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.540067 sec
+ 2,319,774,330 cycles # 2.822 GHz
+ 3,451,101,558 instructions # 1.49 insn per cycle
+ 0.879382497 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.953822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.011638e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.011638e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.863519e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.915445e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.915445e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 5.471139 sec
-INFO: No Floating Point Exceptions have been reported
- 16,020,529,034 cycles # 2.925 GHz
- 44,492,038,074 instructions # 2.78 insn per cycle
- 5.480388445 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 536) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.714163 sec
+ 16,436,244,073 cycles # 2.874 GHz
+ 45,749,025,669 instructions # 2.78 insn per cycle
+ 5.719759349 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
Avg ME (F77/C++) = 2.0288198669441044
Relative difference = 6.558289825352968e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.317220e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.788673e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.788673e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.094180e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.374844e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.374844e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.075008 sec
-INFO: No Floating Point Exceptions have been reported
- 6,135,177,420 cycles # 2.947 GHz
- 17,131,917,948 instructions # 2.79 insn per cycle
- 2.082995277 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2863) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.648909 sec
+ 7,622,554,704 cycles # 2.873 GHz
+ 18,456,491,216 instructions # 2.42 insn per cycle
+ 2.654484927 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3494) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
Avg ME (F77/C++) = 2.0288193075684831
Relative difference = 1.515997647531052e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.077036e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.672972e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.672972e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.518596e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.723041e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.723041e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.827961 sec
-INFO: No Floating Point Exceptions have been reported
- 5,098,745,585 cycles # 2.778 GHz
- 10,277,927,063 instructions # 2.02 insn per cycle
- 1.836088116 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3907) (512y: 0) (512z: 0)
+TOTAL : 1.318146 sec
+ 3,562,654,657 cycles # 2.694 GHz
+ 7,627,422,650 instructions # 2.14 insn per cycle
+ 1.323687691 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3243) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.138089e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.753320e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.753320e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.928783e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.025973e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.025973e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.811229 sec
-INFO: No Floating Point Exceptions have been reported
- 5,047,478,028 cycles # 2.778 GHz
- 10,048,355,032 instructions # 1.99 insn per cycle
- 1.819572790 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3806) (512y: 2) (512z: 0)
+TOTAL : 1.261737 sec
+ 3,419,342,835 cycles # 2.700 GHz
+ 7,466,037,637 instructions # 2.18 insn per cycle
+ 1.267292458 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3148) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.690006e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.022722e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.022722e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.769269e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.500411e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.500411e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.339710 sec
-INFO: No Floating Point Exceptions have been reported
- 4,430,484,038 cycles # 1.888 GHz
- 8,494,687,635 instructions # 1.92 insn per cycle
- 2.347901015 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2746) (512y: 4) (512z: 2754)
+TOTAL : 1.636941 sec
+ 3,055,793,005 cycles # 1.861 GHz
+ 5,247,434,771 instructions # 1.72 insn per cycle
+ 1.642577316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2056) (512y: 5) (512z: 2089)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index 8666f655aa..c154d25891 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-10-06_09:29:20
+DATE: 2025-09-24_08:47:00
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.502979e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.757241e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.878370e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.866634e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.147053e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.165837e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.488684 sec
-INFO: No Floating Point Exceptions have been reported
- 2,072,092,086 cycles # 2.888 GHz
- 2,980,809,123 instructions # 1.44 insn per cycle
- 0.774128701 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.538880 sec
+ 2,314,341,368 cycles # 2.820 GHz
+ 3,436,565,941 instructions # 1.48 insn per cycle
+ 0.878082792 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.497944e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.591831e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.591831e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.008534e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.069275e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.069275e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.301034 sec
-INFO: No Floating Point Exceptions have been reported
- 12,652,758,977 cycles # 2.937 GHz
- 34,660,886,060 instructions # 2.74 insn per cycle
- 4.309086604 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.307798 sec
+ 15,234,969,835 cycles # 2.868 GHz
+ 42,317,180,883 instructions # 2.78 insn per cycle
+ 5.313067205 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 720) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288199094356969
-Relative difference = 4.463890496342449e-08
+Avg ME (F77/C++) = 2.0288198612397537
+Relative difference = 6.839455762672188e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.170038e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.622090e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.622090e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.541314e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.891142e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.891142e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.133248 sec
-INFO: No Floating Point Exceptions have been reported
- 6,307,478,134 cycles # 2.947 GHz
- 14,873,781,997 instructions # 2.36 insn per cycle
- 2.140857047 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2975) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.396788 sec
+ 6,875,094,670 cycles # 2.863 GHz
+ 16,341,506,518 instructions # 2.38 insn per cycle
+ 2.402179834 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3288) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193755550310
-Relative difference = 1.8511017053446366e-07
+Avg ME (F77/C++) = 2.0288193161832169
+Relative difference = 1.5584594630759239e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.248492e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.104502e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.104502e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.990295e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.036634e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.036634e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.548692 sec
-INFO: No Floating Point Exceptions have been reported
- 4,331,332,767 cycles # 2.784 GHz
- 9,119,017,787 instructions # 2.11 insn per cycle
- 1.556682967 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4456) (512y: 0) (512z: 0)
+TOTAL : 1.253414 sec
+ 3,371,756,397 cycles # 2.681 GHz
+ 6,839,443,832 instructions # 2.03 insn per cycle
+ 1.258730374 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3139) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182069780305
-Relative difference = 1.0201902325125583e-07
+Avg ME (F77/C++) = 2.0288181848591194
+Relative difference = 9.111665983220217e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.353371e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.251881e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.251881e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.825152e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.146044e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.146044e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.529166 sec
-INFO: No Floating Point Exceptions have been reported
- 4,288,032,705 cycles # 2.791 GHz
- 8,709,611,506 instructions # 2.03 insn per cycle
- 1.537124060 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4233) (512y: 0) (512z: 0)
+TOTAL : 1.154321 sec
+ 3,123,268,751 cycles # 2.696 GHz
+ 6,388,429,640 instructions # 2.05 insn per cycle
+ 1.159462810 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2886) (512y: 7) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182069780305
-Relative difference = 1.0201902325125583e-07
+Avg ME (F77/C++) = 2.0288181848591194
+Relative difference = 9.111665983220217e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.411255e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.862053e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.862053e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.255889e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.103086e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.103086e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.041395 sec
-INFO: No Floating Point Exceptions have been reported
- 3,904,121,018 cycles # 1.906 GHz
- 7,856,412,999 instructions # 2.01 insn per cycle
- 2.049301951 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4273) (512y: 0) (512z: 2558)
+TOTAL : 1.532528 sec
+ 2,868,652,585 cycles # 1.867 GHz
+ 4,790,852,156 instructions # 1.67 insn per cycle
+ 1.537786537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1862) (512y: 5) (512z: 1976)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183246739209
-Relative difference = 1.6003107281264138e-07
+Avg ME (F77/C++) = 2.0288183073280379
+Relative difference = 1.514813246576993e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index 74b1cf75ec..58c504e753 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-10-06_09:29:40
+DATE: 2025-09-24_08:47:30
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.573239e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.755917e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.881516e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.874108e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.155948e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.175666e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.487451 sec
-INFO: No Floating Point Exceptions have been reported
- 2,067,657,057 cycles # 2.894 GHz
- 2,969,147,079 instructions # 1.44 insn per cycle
- 0.771604792 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 126
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.548499 sec
+ 2,349,082,705 cycles # 2.826 GHz
+ 3,444,335,519 instructions # 1.47 insn per cycle
+ 0.888498013 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.674902e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.781976e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.781976e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.004031e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.064190e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.064190e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.022349 sec
-INFO: No Floating Point Exceptions have been reported
- 11,884,847,246 cycles # 2.950 GHz
- 35,128,022,846 instructions # 2.96 insn per cycle
- 4.030241157 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.318944 sec
+ 15,280,578,599 cycles # 2.871 GHz
+ 42,518,914,092 instructions # 2.78 insn per cycle
+ 5.324125018 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288199094356969
-Relative difference = 4.463890496342449e-08
+Avg ME (F77/C++) = 2.0288198620546609
+Relative difference = 6.799289200198014e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.473588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.982990e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.982990e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.572000e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.924083e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.924083e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.018275 sec
-INFO: No Floating Point Exceptions have been reported
- 5,977,087,994 cycles # 2.951 GHz
- 14,582,659,278 instructions # 2.44 insn per cycle
- 2.026172081 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2569) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.380861 sec
+ 6,844,477,071 cycles # 2.870 GHz
+ 16,273,997,793 instructions # 2.38 insn per cycle
+ 2.386295475 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3006) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193583255634
-Relative difference = 1.7661780742548925e-07
+Avg ME (F77/C++) = 2.0288193161832169
+Relative difference = 1.5584594630759239e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.377553e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.279187e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.279187e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.075588e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.046375e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.046375e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.524553 sec
-INFO: No Floating Point Exceptions have been reported
- 4,234,763,555 cycles # 2.764 GHz
- 8,897,798,804 instructions # 2.10 insn per cycle
- 1.532761317 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3552) (512y: 0) (512z: 0)
+TOTAL : 1.242091 sec
+ 3,350,889,709 cycles # 2.689 GHz
+ 6,722,008,368 instructions # 2.01 insn per cycle
+ 1.247472305 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2661) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182107033208
-Relative difference = 1.0385521077446488e-07
+Avg ME (F77/C++) = 2.0288181760115549
+Relative difference = 8.67557144645807e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.495273e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.420338e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.420338e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.976555e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.166609e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.166609e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.502506 sec
-INFO: No Floating Point Exceptions have been reported
- 4,214,392,060 cycles # 2.792 GHz
- 8,461,762,117 instructions # 2.01 insn per cycle
- 1.510417354 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3296) (512y: 0) (512z: 0)
+TOTAL : 1.138185 sec
+ 3,060,302,865 cycles # 2.678 GHz
+ 6,320,589,737 instructions # 2.07 insn per cycle
+ 1.143708310 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2450) (512y: 7) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182107033208
-Relative difference = 1.0385521077446488e-07
+Avg ME (F77/C++) = 2.0288181760115549
+Relative difference = 8.67557144645807e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.487070e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.949626e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.949626e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.207768e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.045909e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.045909e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.014420 sec
-INFO: No Floating Point Exceptions have been reported
- 3,856,759,695 cycles # 1.908 GHz
- 7,749,847,516 instructions # 2.01 insn per cycle
- 2.022398856 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 2110)
+TOTAL : 1.542518 sec
+ 2,878,874,944 cycles # 1.861 GHz
+ 4,765,123,226 instructions # 1.66 insn per cycle
+ 1.547749585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 22) (512z: 1745)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183204829693 -Relative difference = 1.5796536184903122e-07 +Avg ME (F77/C++) = 2.0288183073280379 +Relative difference = 1.514813246576993e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..8e04255902 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:17:15 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +8.034851e+05 1 256 +1.570032e+06 2 256 +3.267619e+06 4 256 +6.473638e+06 8 256 +9.903743e+06 16 256 +1.026330e+07 32 256 +9.922907e+06 64 256 +1.019578e+07 128 256 +1.041824e+07 256 256 +1.041387e+07 512 256 +1.043869e+07 1024 256 +### GPU: scaling test 32 +1.089447e+05 1 32 +2.100696e+05 2 32 +4.218867e+05 4 32 +8.222443e+05 8 32 +1.614383e+06 16 32 +3.233783e+06 32 32 +6.305788e+06 64 32 +1.005593e+07 128 32 +1.031516e+07 256 32 +9.639949e+06 512 32 +1.002191e+07 1024 32 +1.025487e+07 2048 32 +1.030770e+07 4096 32 +1.032661e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.651184e+05 1 256 +1.714247e+05 2 256 +1.762711e+05 4 256 +### CPU: scaling test 32 +1.670730e+05 1 32 +1.646840e+05 2 32 +1.678567e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.882857e+05 1 256 +2.987691e+05 2 256 +2.934247e+05 4 256 +### CPU: scaling test 32 +2.791152e+05 1 32 +2.799466e+05 2 32 +2.522058e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.293894e+05 1 256 +4.911285e+05 2 256 +5.301102e+05 4 256 +### CPU: scaling test 32 +5.227648e+05 1 32 +4.919785e+05 2 32 +5.143165e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.210348e+05 1 256 +5.219334e+05 2 256 +5.165907e+05 4 256 +### CPU: scaling test 32 +5.552375e+05 1 32 +5.620593e+05 2 32 +5.696104e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.434605e+05 1 256 +3.468478e+05 2 256 +3.526863e+05 4 256 +### CPU: scaling test 32 +3.476674e+05 1 32 +3.530762e+05 2 32 +3.452219e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 46bc87b45e..9512e6842c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
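
Each "### GPU/CPU: scaling test" block in the .scaling files above lists one measurement per line as <EvtsPerSec> <nblocks> <nthreadsperblock>. A hypothetical helper (the format is assumed from the logs; this is not a repository script) to locate the throughput plateau:

    def scaling_plateau(block):
        """Return peak throughput and the grid sizes within 10% of it."""
        rows = [line.split() for line in block.strip().splitlines()]
        data = [(float(t), int(b), int(g)) for t, b, g in rows]
        peak = max(t for t, _, _ in data)
        near = [(b, g) for t, b, g in data if t >= 0.9 * peak]
        return peak, near

    # e.g. the CUDA "scaling test 256" block above peaks near 1.04e7 events/s
    # and is already within 10% of that plateau from 16 blocks upwards
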
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:41 +DATE: 2025-09-24_07:45:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.456560e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.379988e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000705e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.420550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051392e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058569e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534501 sec -INFO: No Floating Point Exceptions have been reported - 2,219,584,721 cycles # 2.878 GHz - 3,138,987,562 instructions # 1.41 insn per cycle - 0.829330920 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624089 sec + 2,740,298,329 cycles # 2.837 GHz + 4,376,999,754 instructions # 1.60 insn per cycle + 1.027657731 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063943199761 +Relative difference = 2.9853999119330943e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.813220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.859845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.859845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.739456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782512e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.928691 sec -INFO: No Floating Point Exceptions have been reported - 17,514,594,449 cycles # 2.949 GHz - 46,201,641,620 instructions # 2.64 insn per cycle - 5.940965337 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.137137 sec + 17,632,069,705 cycles # 2.871 GHz + 47,071,786,706 instructions # 2.67 insn per cycle + 6.142721284 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.229159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.395479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.395479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.969738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.110688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.110688e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.394540 sec -INFO: No Floating Point Exceptions have been reported - 10,052,901,757 cycles # 2.953 GHz - 27,702,324,481 instructions # 2.76 insn per cycle - 3.406321535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.640304 sec + 10,461,431,755 cycles # 2.870 GHz + 28,885,201,179 instructions # 2.76 insn per cycle + 3.646099818 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.062332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.465524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.465524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.866190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.240249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.240249e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.217135 sec -INFO: No Floating Point Exceptions have been reported - 6,171,509,914 cycles # 2.770 GHz - 12,603,170,569 instructions # 2.04 insn per cycle - 2.229995554 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2773) (512y: 0) (512z: 0) +TOTAL : 2.263992 sec + 6,074,776,768 cycles # 2.677 GHz + 12,331,537,932 instructions # 2.03 insn per cycle + 2.269808090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.580384e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.181720e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.599871e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.599871e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025421 sec -INFO: No Floating Point Exceptions have been reported - 5,651,741,681 cycles # 2.776 GHz - 12,038,443,177 instructions # 2.13 insn per cycle - 2.038138408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2518) (512y: 146) (512z: 0) +TOTAL : 2.132428 sec + 5,740,054,892 cycles # 2.685 GHz + 11,983,118,985 instructions # 2.09 insn per cycle + 2.138292263 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.630973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.349584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520024e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.034632 sec -INFO: No Floating Point Exceptions have been reported - 5,740,712,408 cycles # 1.885 GHz - 8,225,599,297 instructions # 1.43 insn per cycle - 3.047056631 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1862) +TOTAL : 3.239992 sec + 5,775,149,982 cycles # 1.780 GHz + 7,815,004,616 instructions # 1.35 insn per cycle + 3.245774275 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..238832770b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
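
The hardware-counter lines in each run (cycles, instructions, elapsed time) are raw counts, and the figures printed after "#" follow by simple division. Checking the 512z numbers quoted just above (values copied verbatim, no new measurements):

    cycles, instructions = 5_775_149_982, 7_815_004_616
    elapsed_s = 3.245774275
    print(f"{instructions / cycles:.2f} insn per cycle")  # 1.35, as printed above
    print(f"{cycles / elapsed_s / 1e9:.2f} GHz")          # ~1.78; perf divides by
                                                          # task-clock, hence 1.780
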
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:37:03 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.361105e+05 1 256 +4.555334e+05 2 256 +8.473441e+05 4 256 +1.550614e+06 8 256 +2.631040e+06 16 256 +4.137873e+06 32 256 +4.813875e+06 64 256 +5.487026e+06 128 256 +5.764957e+06 256 256 +5.992209e+06 512 256 +6.106508e+06 1024 256 +### GPU: scaling test 32 +2.958875e+04 1 32 +6.085178e+04 2 32 +1.192154e+05 4 32 +2.314980e+05 8 32 +4.464807e+05 16 32 +8.917880e+05 32 32 +1.537449e+06 64 32 +2.798692e+06 128 32 +3.996433e+06 256 32 +4.779533e+06 512 32 +5.364461e+06 1024 32 +5.738288e+06 2048 32 +5.946428e+06 4096 32 +6.037437e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.685006e+05 1 256 +1.723354e+05 2 256 +1.638859e+05 4 256 +### CPU: scaling test 32 +1.369740e+05 1 32 +1.491240e+05 2 32 +1.579015e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.727240e+05 1 256 +3.013403e+05 2 256 +2.999575e+05 4 256 +### CPU: scaling test 32 +2.693875e+05 1 32 +2.195495e+05 2 32 +2.692487e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.889061e+05 1 256 +4.863458e+05 2 256 +5.265835e+05 4 256 +### CPU: scaling test 32 +4.856285e+05 1 32 +5.290699e+05 2 32 +5.203400e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.207613e+05 1 256 +5.180923e+05 2 256 +5.299914e+05 4 256 
+### CPU: scaling test 32 +5.124756e+05 1 32 +5.235731e+05 2 32 +5.103201e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.408412e+05 1 256 +3.500331e+05 2 256 +3.527965e+05 4 256 +### CPU: scaling test 32 +3.110934e+05 1 32 +2.928378e+05 2 32 +3.461602e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..a87f7a8546 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:31:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.870978e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.293708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.319514e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.390916 sec + 5,054,727,678 cycles # 2.837 GHz + 7,164,050,823 instructions # 1.42 insn per cycle + 1.855053661 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
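
The runNcu step above collects two SASS metrics per kernel, and the profile shows that the former monolithic sigmaKin kernel (214 registers per thread in the old logs) is now split into diagram1, diagram2 and color_sum_kernel (97, 70 and 26 registers). A sketch of reproducing one such measurement with the Nsight Compute CLI via Python (it assumes ncu is on PATH and the executable path matches the logs above):

    import subprocess

    subprocess.run(
        [
            "ncu",
            "--metrics",
            "launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct",
            "--kernel-name", "color_sum_kernel",  # or diagram1 / diagram2
            "./build.cuda_m_inl0_hrd0/check_cuda.exe", "-p", "2048", "256", "1",
        ],
        check=True,
    )
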
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288064033535846 +Relative difference = 2.940873209649997e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.739239e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782320e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.137107 sec + 17,636,122,301 cycles # 2.872 GHz + 47,073,973,330 instructions # 2.67 insn per cycle + 6.142795081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.974256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.114750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.114750e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.634370 sec + 10,446,510,479 cycles # 2.871 GHz + 28,885,820,804 instructions # 2.77 insn per cycle + 3.640145208 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
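
The "MEK (channelid array)" tallies repeated throughout these logs always split 512 events as { 1 : 192, 2 : 160, 3 : 160 }. That pattern is consistent with channel ids assigned round-robin in pages of 32 events; a purely illustrative re-derivation (the page size is an inference from the counts, not taken from the code):

    from collections import Counter

    nevt, nchan, page = 512, 3, 32  # page size inferred, not from the code
    counts = Counter((ievt // page) % nchan + 1 for ievt in range(nevt))
    assert counts == {1: 192, 2: 160, 3: 160}
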
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.880489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.259855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.259855e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.257451 sec + 6,073,853,734 cycles # 2.685 GHz + 12,332,121,068 instructions # 2.03 insn per cycle + 2.263129601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.165320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.583342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583342e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.138471 sec + 5,745,518,950 cycles # 2.681 GHz + 11,981,716,362 instructions # 2.09 insn per cycle + 2.143992444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.356813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.527339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.527339e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.232530 sec + 5,761,140,182 cycles # 1.780 GHz + 7,815,145,650 instructions # 1.36 insn per cycle + 3.238204918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..b5b337f1e5 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. 
+ +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_09:20:54 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.264552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050944e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059373e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.628494 sec + 2,717,437,197 cycles # 2.834 GHz + 4,420,610,573 instructions # 1.63 insn per cycle + 1.016415872 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 97
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 70
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 2.028807e+00
+Avg ME (F77/GPU) = 2.0288063943199761
+Relative difference = 2.9853999119330943e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.741970e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.785210e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.785210e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 6.127535 sec
+ 17,619,918,709 cycles # 2.874 GHz
+ 47,072,142,698 instructions # 2.67 insn per cycle
+ 6.132916216 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063903750300
+Relative difference = 3.0048445715164216e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 2.973024e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.113129e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.113129e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.635704 sec
+ 10,451,978,137 cycles # 2.872 GHz
+ 28,884,800,063 instructions # 2.76 insn per cycle
+ 3.641136554 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063903750300
+Relative difference = 3.0048445715164216e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.893038e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.274118e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.274118e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.252307 sec
+ 6,067,632,575 cycles # 2.688 GHz
+ 12,331,256,225 instructions # 2.03 insn per cycle
+ 2.257676783 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 5.165992e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.582198e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.582198e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.138263 sec
+ 5,738,986,912 cycles # 2.678 GHz
+ 11,983,127,376 instructions # 2.09 insn per cycle
+ 2.143816952 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.358397e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.528203e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.528203e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.229987 sec
+ 5,756,957,930 cycles # 1.780 GHz
+ 7,814,408,912 instructions # 1.36 insn per cycle
+ 3.235396193 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index ffa5410982..82fc0394a2 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-10-06_08:59:06
+DATE: 2025-09-24_07:46:30
 
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.422071e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.351796e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.985674e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.382296e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.046566e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053685e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.535641 sec
-INFO: No Floating Point Exceptions have been reported
- 2,214,747,611 cycles # 2.879 GHz
- 3,172,033,471 instructions # 1.43 insn per cycle
- 0.829540839 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 212
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.623847 sec
+ 2,768,474,538 cycles # 2.832 GHz
+ 4,429,449,797 instructions # 1.60 insn per cycle
+ 1.035793687 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 98
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 70
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 2.028807e+00
-Avg ME (F77/GPU) = 2.0288063423243874
-Relative difference = 3.241686432649386e-07
+Avg ME (F77/GPU) = 2.0288063943199761
+Relative difference = 2.9853999119330943e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.862163e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.911340e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.911340e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.795413e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.841835e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.841835e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.777703 sec
-INFO: No Floating Point Exceptions have been reported
- 17,097,861,095 cycles # 2.954 GHz
- 45,230,787,591 instructions # 2.65 insn per cycle
- 5.789414615 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.947704 sec
+ 17,088,961,971 cycles # 2.871 GHz
+ 45,931,713,417 instructions # 2.69 insn per cycle
+ 5.953595197 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 633) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063903750300
 Relative difference = 3.0048445715164216e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.356972e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.536408e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.536408e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.986501e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.128158e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.128158e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.270231 sec
-INFO: No Floating Point Exceptions have been reported
- 9,665,855,757 cycles # 2.946 GHz
- 26,370,377,514 instructions # 2.73 insn per cycle
- 3.281726897 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.619942 sec
+ 10,404,246,476 cycles # 2.871 GHz
+ 28,876,204,491 instructions # 2.78 insn per cycle
+ 3.625766762 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2953) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063903750300
 Relative difference = 3.0048445715164216e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.515319e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.832036e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.832036e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.863046e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.235612e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.235612e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.470441 sec
-INFO: No Floating Point Exceptions have been reported
- 6,884,599,220 cycles # 2.774 GHz
- 14,150,233,239 instructions # 2.06 insn per cycle
- 2.482504065 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2896) (512y: 0) (512z: 0)
+TOTAL : 2.265397 sec
+ 6,071,670,861 cycles # 2.675 GHz
+ 12,327,353,870 instructions # 2.03 insn per cycle
+ 2.271397009 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.744762e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.096792e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.096792e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.132632e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.541433e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.541433e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.356796 sec
-INFO: No Floating Point Exceptions have been reported
- 6,551,408,744 cycles # 2.767 GHz
- 13,642,717,150 instructions # 2.08 insn per cycle
- 2.368190066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2535) (512y: 302) (512z: 0)
+TOTAL : 2.149810 sec
+ 5,735,457,307 cycles # 2.663 GHz
+ 11,976,172,204 instructions # 2.09 insn per cycle
+ 2.154912250 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2696) (512y: 49) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.568399e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.763148e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.763148e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.326526e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.493190e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.493190e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.086761 sec
-INFO: No Floating Point Exceptions have been reported
- 5,741,113,391 cycles # 1.854 GHz
- 9,326,512,235 instructions # 1.62 insn per cycle
- 3.098253222 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2060)
+TOTAL : 3.259408 sec
+ 5,774,724,967 cycles # 1.770 GHz
+ 7,811,134,891 instructions # 1.35 insn per cycle
+ 3.264533598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 69) (512z: 1931)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling
new file mode 100644
index 0000000000..0430c29866
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-09-24_08:17:56
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.170210e+05 1 256
+2.125984e+05 2 256
+3.959312e+05 4 256
+7.383690e+05 8 256
+1.256510e+06 16 256
+1.302525e+06 32 256
+1.137351e+06 64 256
+1.148455e+06 128 256
+1.155387e+06 256 256
+1.176497e+06 512 256
+1.174053e+06 1024 256
+### GPU: scaling test 32
+1.722763e+04 1 32
+3.648322e+04 2 32
+6.851189e+04 4 32
+1.336462e+05 8 32
+2.554053e+05 16 32
+4.599070e+05 32 32
+8.068325e+05 64 32
+1.288259e+06 128 32
+1.347518e+06 256 32
+1.114318e+06 512 32
+1.117513e+06 1024 32
+1.106788e+06 2048 32
+1.132284e+06 4096 32
+1.131009e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.295968e+04 1 256
+2.304103e+04 2 256
+2.327288e+04 4 256
+### CPU: scaling test 32
+2.147053e+04 1 32
+2.182408e+04 2 32
+2.202495e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.224019e+04 1 256
+4.240941e+04 2 256
+4.252698e+04 4 256
+### CPU: scaling test 32
+3.980070e+04 1 32
+3.623199e+04 2 32
+4.108926e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.181243e+04 1 256
+8.010987e+04 2 256
+8.447301e+04 4 256
+### CPU: scaling test 32
+8.608051e+04 1 32
+8.766103e+04 2 32
+7.450255e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.863253e+04 1 256
+8.758686e+04 2 256
+9.198692e+04 4 256
+### CPU: scaling test 32
+9.391931e+04 1 32
+8.586478e+04 2 32
+8.665993e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.072923e+04 1 256
+6.991165e+04 2 256
+7.056341e+04 4 256
+### CPU: scaling test 32
+7.025617e+04 1 32
+6.971883e+04 2 32
+6.550582e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 028292e268..c992d10b68 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-10-06_09:00:14
+DATE: 2025-09-24_07:48:07
 
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.612194e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.849217e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.964394e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.112133e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.140622e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.142508e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.477846 sec
-INFO: No Floating Point Exceptions have been reported
- 1,998,983,760 cycles # 2.871 GHz
- 2,812,176,587 instructions # 1.41 insn per cycle
- 0.759674168 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.605411 sec
+ 2,421,511,234 cycles # 2.827 GHz
+ 3,594,444,056 instructions # 1.48 insn per cycle
+ 0.913457815 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.042987e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.232338e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.242858e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.621402 sec
-INFO: No Floating Point Exceptions have been reported
- 2,510,286,495 cycles # 2.883 GHz
- 3,752,986,245 instructions # 1.50 insn per cycle
- 0.931747637 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.413122e+00
-Avg ME (F77/GPU) = 1.4131213684418649
-Relative difference = 4.469239988637851e-07
+Avg ME (F77/GPU) = 1.4131213684418646
+Relative difference = 4.4692399902091566e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.434605e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.446812e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.446812e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.312222e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.323523e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.323523e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.752117 sec
-INFO: No Floating Point Exceptions have been reported
- 19,916,103,310 cycles # 2.949 GHz
- 59,916,518,373 instructions # 3.01 insn per cycle
- 6.756066066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.107355 sec
+ 20,283,539,678 cycles # 2.853 GHz
+ 59,993,146,243 instructions # 2.96 insn per cycle
+ 7.111396668 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684432433
-Relative difference = 4.46923023397472e-07
+Avg ME (F77/C++) = 1.4131213684432431
+Relative difference = 4.4692302355460254e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.568526e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.611480e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.611480e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.232536e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.270759e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.270759e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.606956 sec
-INFO: No Floating Point Exceptions have been reported
- 10,571,212,167 cycles # 2.928 GHz
- 31,086,653,440 instructions # 2.94 insn per cycle
- 3.611892241 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.891383 sec
+ 11,201,770,072 cycles # 2.876 GHz
+ 32,050,630,389 instructions # 2.86 insn per cycle
+ 3.895464161 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5777) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684432433
-Relative difference = 4.46923023397472e-07
+Avg ME (F77/C++) = 1.4131213684432435
+Relative difference = 4.4692302324034146e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.091675e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.256165e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.256165e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.513064e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.666589e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.666589e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.823998 sec
-INFO: No Floating Point Exceptions have been reported
- 4,999,238,647 cycles # 2.738 GHz
- 11,406,827,724 instructions # 2.28 insn per cycle
- 1.827985092 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0)
+TOTAL : 1.945463 sec
+ 5,194,923,693 cycles # 2.666 GHz
+ 11,932,633,899 instructions # 2.30 insn per cycle
+ 1.949544641 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4713) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
 Avg ME (F77/C++) = 1.4131213684416466
 Relative difference = 4.469241533230934e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.026950e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.047965e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.047965e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.375739e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.559826e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.559826e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.617207 sec
-INFO: No Floating Point Exceptions have been reported
- 4,447,500,259 cycles # 2.747 GHz
- 10,665,398,274 instructions # 2.40 insn per cycle
- 1.621167175 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0)
+TOTAL : 1.768434 sec
+ 4,741,825,240 cycles # 2.676 GHz
+ 11,283,684,571 instructions # 2.38 insn per cycle
+ 1.772501235 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 55) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO:
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.168386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.273905e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.273905e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.970736e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.074987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.074987e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.309115 sec -INFO: No Floating Point Exceptions have been reported - 4,128,751,307 cycles # 1.785 GHz - 5,972,449,468 instructions # 1.45 insn per cycle - 2.314144205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.373564 sec + 4,058,654,330 cycles # 1.708 GHz + 5,985,363,778 instructions # 1.47 insn per cycle + 2.377708670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 76636470b0..ad07f72fbe 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
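Each cmpExe block above cross-checks the average matrix element from the C++/CUDA path against the Fortran (F77) path, and the run is accepted when the relative difference stays within 5E-3. A minimal sketch of that acceptance test in Python, using the 512z figures just above (rel_diff is an illustrative helper, not the actual cmpExe code):

  def rel_diff(a, b):
      # relative difference between the two averaged matrix elements
      return abs(a - b) / abs(a)

  avg_me_cpp = 1.413122e+00        # Avg ME (C++/C++)
  avg_me_f77 = 1.4131213684416482  # Avg ME (F77/C++)
  d = rel_diff(avg_me_cpp, avg_me_f77)
  print(d)                         # ~4.469e-07, matching the log
  assert d <= 5e-3                 # 'OK (relative difference <= 5E-3)'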
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:23 +DATE: 2025-09-24_08:56:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.472313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.180220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.180220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.524676e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096044e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096044e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.504857 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,064,539,289 cycles # 2.862 GHz - 3,123,566,672 instructions # 1.51 insn per cycle - 0.778239097 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.635613 sec + 2,494,153,441 cycles # 2.830 GHz + 3,899,501,791 instructions # 1.56 insn per cycle + 0.938261269 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 104 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.683325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.341961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.341961e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.833212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,141,452,514 cycles # 2.889 GHz - 4,965,295,428 instructions # 1.58 insn per cycle - 1.145190233 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.439308e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451643e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.329451e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.340969e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340969e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.745227 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,922,326,116 cycles # 2.952 GHz - 59,921,657,661 instructions # 3.01 insn per cycle - 6.749767217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.063608 sec + 20,321,926,121 cycles # 2.876 GHz + 59,999,205,151 instructions # 2.95 insn per cycle + 7.068112656 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
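The performance counters in these blocks are self-consistent: the printed clock is roughly cycles divided by elapsed time (perf normalises by task-clock, so an elapsed-time estimate is only approximate), and insn per cycle is instructions divided by cycles. A quick consistency check in Python against the 'none' bridge figures above:

  cycles = 20_321_926_121
  instructions = 59_999_205_151
  elapsed_s = 7.068112656

  print(f'{cycles / elapsed_s / 1e9:.3f} GHz')          # ~2.875 (log: 2.876)
  print(f'{instructions / cycles:.2f} insn per cycle')  # 2.95, as logged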
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590762e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.634359e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.634359e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.221277e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.259857e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.259857e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596308 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,606,556,243 cycles # 2.946 GHz - 31,132,640,347 instructions # 2.94 insn per cycle - 3.600784290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.910772 sec + 11,246,346,443 cycles # 2.873 GHz + 32,099,028,986 instructions # 2.85 insn per cycle + 3.915133075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5777) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432435 +Relative difference = 4.4692302324034146e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.045361e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.212711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.212711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.480438e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.636114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.636114e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.840181 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,054,435,549 cycles # 2.741 GHz - 11,457,891,523 instructions # 2.27 insn per cycle - 1.844724432 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.962684 sec + 5,238,103,974 cycles # 2.664 GHz + 11,982,359,006 instructions # 2.29 insn per cycle + 1.967119077 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4713) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.028589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.303300e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.490816e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.490816e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.621206 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,484,828,096 cycles # 2.760 GHz - 10,715,944,638 instructions # 2.39 insn per cycle - 1.625802151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.791254 sec + 4,799,685,272 cycles # 2.674 GHz + 11,333,725,028 instructions # 2.36 insn per cycle + 1.795648986 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.165257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.268564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.268564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.945215e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.049356e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.049356e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.316443 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,162,925,866 cycles # 1.795 GHz - 6,008,954,577 instructions # 1.44 insn per cycle - 2.321140123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.391856 sec + 4,105,469,145 cycles # 1.714 GHz + 6,023,724,588 instructions # 1.47 insn per cycle + 2.396293037 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
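The event counts in these bridge runs follow directly from the grid arguments: '-p 64 256 10' requests 64 GPU blocks of 256 threads (the trailing 10 is presumably the iteration count), and the bridge warnings report nevt = gpublocks * gputhreads. The same arithmetic in Python:

  gpublocks, gputhreads = 64, 256            # from '-p 64 256 10'
  assert gpublocks * gputhreads == 16384     # 'Instantiate ... Bridge (nevt=16384, ...)'
  assert 2048 * 256 == 524288                # '-p 2048 256 1' gave nevt=524288 earlier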
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 49402063e2..d309d70a53 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:40 +DATE: 2025-09-24_07:48:50 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.575064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.921304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.028957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089398e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119247e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477703 sec -INFO: No Floating Point Exceptions have been reported - 1,994,590,518 cycles # 2.865 GHz - 2,848,992,929 instructions # 1.43 insn per cycle - 0.754407053 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.610175 sec + 2,428,893,625 cycles # 2.825 GHz + 3,658,657,722 instructions # 1.51 insn per cycle + 0.917445904 seconds time elapsed ......................................................................... 
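The runNcu blocks show the most visible structural change between the two runs: the 2024 logs profile a single monolithic sigmaKin kernel (255 registers per thread), while the 2025 logs profile separate per-diagram kernels plus a color_sum_kernel with much lower register pressure. A throwaway Python parser to tabulate these figures, fed with the ==PROF== lines from the bridge log earlier:

  import re

  log_text = '''
  ==PROF== Profiling "diagram1": launch__registers_per_thread 104
  ==PROF== Profiling "diagram2": launch__registers_per_thread 74
  ==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48
  '''
  prof = re.compile(r'==PROF== Profiling "(\w+)": launch__registers_per_thread (\d+)')
  print({name: int(regs) for name, regs in prof.findall(log_text)})
  # {'diagram1': 104, 'diagram2': 74, 'color_sum_kernel': 48}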
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242712e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.616679 sec -INFO: No Floating Point Exceptions have been reported - 2,463,746,118 cycles # 2.874 GHz - 3,716,874,386 instructions # 1.51 insn per cycle - 0.917442132 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 100 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
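Read together, the EvtsPerSec[MatrixElems] figures quantify the SIMD scaling on this Xeon Silver 4216 host: relative to the scalar 'none' build, the 2025 d_inl0_hrd0 numbers give roughly 1.8x for sse4, 3.7x for avx2, 4.1x for 512y and 3.0x for 512z (the 512-bit build runs at a lower clock, ~1.7 GHz versus ~2.7-2.9 GHz, which caps its advantage). The arithmetic in Python:

  # EvtsPerSec[MatrixElems] from the 2025 d_inl0_hrd0 blocks, in sec^-1
  evts_per_sec = {
      'none': 2.323523e4,
      'sse4': 4.270759e4,
      'avx2': 8.666589e4,
      '512y': 9.559826e4,
      '512z': 7.074987e4,
  }
  base = evts_per_sec['none']
  for tag, v in evts_per_sec.items():
      print(f'{tag}: x{v / base:.2f}')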
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.437110e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449363e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449363e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.346165e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.357743e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.357743e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.744189 sec -INFO: No Floating Point Exceptions have been reported - 19,899,963,729 cycles # 2.950 GHz - 60,130,622,589 instructions # 3.02 insn per cycle - 6.748077481 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1322) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.004658 sec + 20,124,920,564 cycles # 2.872 GHz + 59,756,165,398 instructions # 2.97 insn per cycle + 7.008608396 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1376) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.632122e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.676125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.676125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.250499e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289052e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.557509 sec -INFO: No Floating Point Exceptions have been reported - 10,482,296,489 cycles # 2.944 GHz - 30,686,942,862 instructions # 2.93 insn per cycle - 3.561419011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5047) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.874731 sec + 11,138,454,129 cycles # 2.872 GHz + 32,036,455,579 instructions # 2.88 insn per cycle + 3.878707170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5772) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432435 +Relative difference = 4.4692302324034146e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.842314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.999775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.999775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.469803e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.622290e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.622290e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.874706 sec -INFO: No Floating Point Exceptions have been reported - 5,138,957,277 cycles # 2.738 GHz - 11,840,408,683 instructions # 2.30 insn per cycle - 1.878700358 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4746) (512y: 0) (512z: 0) +TOTAL : 1.955145 sec + 5,212,436,941 cycles # 2.662 GHz + 11,925,216,027 instructions # 2.29 insn per cycle + 1.959313231 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.602387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.789550e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.789550e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.342400e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.527508e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.527508e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.727070 sec -INFO: No Floating Point Exceptions have been reported - 4,726,480,466 cycles # 2.731 GHz - 11,165,052,550 instructions # 2.36 insn per cycle - 1.731070886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4403) (512y: 246) (512z: 0) +TOTAL : 1.774348 sec + 4,749,629,736 cycles # 2.673 GHz + 11,276,525,204 instructions # 2.37 insn per cycle + 1.778345112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4495) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.101185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.203049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.203049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.953876e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.057374e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.057374e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.329881 sec -INFO: No Floating Point Exceptions have been reported - 4,155,200,887 cycles # 1.781 GHz - 6,223,800,996 instructions # 1.50 insn per cycle - 2.334090572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1516) (512y: 139) (512z: 3679) +TOTAL : 2.378867 sec + 4,058,382,636 cycles # 1.704 GHz + 5,981,306,817 instructions # 1.47 insn per cycle + 2.382982664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1466) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..7b10b5caca --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-09-24_08:18:42 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.269366e+05 1 256 +2.583265e+05 2 256 +4.800203e+05 4 256 +8.615029e+05 8 256 +1.454960e+06 16 256 +2.424968e+06 32 256 +2.687996e+06 64 256 +2.634524e+06 128 256 +2.688194e+06 256 256 +2.801637e+06 512 256 +2.875295e+06 1024 256 +### GPU: scaling test 32 +1.788863e+04 1 32 +3.678728e+04 2 32 +7.099492e+04 4 32 +1.431307e+05 8 32 +2.693658e+05 16 32 +5.634733e+05 32 32 +8.933206e+05 64 32 +1.596539e+06 128 32 +2.441699e+06 256 32 +2.725349e+06 512 32 +2.570366e+06 1024 32 +2.561991e+06 2048 32 +2.633165e+06 4096 32 +2.693494e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.398985e+04 1 256 +2.291356e+04 2 256 +2.398329e+04 4 256 +### CPU: scaling test 32 +2.294231e+04 1 32 +2.313364e+04 2 32 +2.339922e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.662482e+04 1 256 +7.711438e+04 2 256 +7.766647e+04 4 256 +### CPU: scaling test 32 +6.733199e+04 1 32 +6.569068e+04 2 32 +7.396390e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.564092e+05 1 256 +1.540938e+05 2 256 +1.563867e+05 4 256 +### CPU: scaling test 32 +1.695939e+05 1 32 +1.711134e+05 2 32 +1.550985e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.839907e+05 1 256 +1.684652e+05 2 256 
+1.765554e+05 4 256 +### CPU: scaling test 32 +1.723748e+05 1 32 +1.697352e+05 2 32 +1.712223e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.432731e+05 1 256 +1.455285e+05 2 256 +1.430022e+05 4 256 +### CPU: scaling test 32 +1.408587e+05 1 32 +1.474549e+05 2 32 +1.416650e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b4d9344f80..c3559c41cb 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:57 +DATE: 2025-09-24_07:50:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.641235e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057654e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.614171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.701563e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.709156e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.457409 sec -INFO: No Floating Point Exceptions have been reported - 1,937,244,275 cycles # 2.867 GHz - 2,710,892,637 instructions # 1.40 insn per cycle - 0.733854811 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.520748 sec + 2,165,328,196 cycles # 2.822 GHz + 3,063,009,682 instructions # 1.41 insn per cycle + 0.829100323 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.672412e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.384843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427387e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.509900 sec -INFO: No Floating Point Exceptions have been reported - 2,162,696,786 cycles # 2.871 GHz - 3,100,226,347 instructions # 1.43 insn per cycle - 0.811215095 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
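
The cmpExe steps in these logs compare the average matrix element (Avg ME) obtained from the C++/CUDA executable against the one obtained from the Fortran wrapper, and accept the build when the relative difference stays below 5E-3. A minimal C++ sketch of that acceptance arithmetic, assuming the simple definition |F77 - C++| / |C++| (which reproduces the relative differences printed in these logs to all shown digits); the helper name is invented here, and the actual check lives in the madgraph4gpu test driver:

#include <cmath>
#include <cstdio>

// Hypothetical helper illustrating the "OK (relative difference <= 5E-3)" check.
bool acceptAvgME( double avgMEcpp, double avgMEf77, double tol = 5e-3 )
{
  const double relDiff = std::fabs( avgMEf77 - avgMEcpp ) / std::fabs( avgMEcpp );
  std::printf( "Relative difference = %.16g\n", relDiff );
  return relDiff <= tol;
}

int main()
{
  // Values quoted from the cuda_f_inl0_hrd0 cmpExe block in this log:
  // this reproduces "Relative difference = 0.0004350150884479323".
  return acceptAvgME( 1.412607e+00, 1.4132215053590471 ) ? 0 : 1;
}

Note that the float builds agree with Fortran only at the ~4E-4 level, much looser than the ~4.5E-7 agreement of the double-precision runs earlier in this log set, but still well within the 5E-3 tolerance.
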
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132215053590471 +Relative difference = 0.0004350150884479323 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.513642e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526564e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.537186 sec -INFO: No Floating Point Exceptions have been reported - 19,278,711,706 cycles # 2.948 GHz - 59,616,757,005 instructions # 3.09 insn per cycle - 6.541004954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.396916e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409518e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2 +TOTAL : 6.855048 sec + 19,662,700,355 cycles # 2.867 GHz + 60,454,183,722 instructions # 3.07 insn per cycle + 6.859049729 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129950871652284 +Relative difference = 6.168827799708488e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.120315e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.259615e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.259615e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.735130e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.868621e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.868621e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.035209 sec -INFO: No Floating Point Exceptions have been reported - 6,010,527,138 cycles # 2.949 GHz - 17,061,942,080 instructions # 2.84 insn per cycle - 2.038918474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.136971 sec + 6,147,248,224 cycles # 2.873 GHz + 17,502,229,798 instructions # 2.85 insn per cycle + 2.140918572 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6412) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129953611754331 +Relative difference = 2.5560984512808326e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.811746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.811746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.690243e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.690243e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.954915 sec -INFO: No Floating Point Exceptions have been reported - 2,640,169,352 cycles # 2.756 GHz - 6,187,458,591 instructions # 2.34 insn per cycle - 0.958678404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 1.021676 sec + 2,743,963,979 cycles # 2.678 GHz + 6,444,168,368 instructions # 2.35 insn per cycle + 1.025607032 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5176) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133133105791558 +Relative difference = 2.197525641713777e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.923079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.998771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.998771e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.786119e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.853837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853837e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.870044 sec -INFO: No Floating Point Exceptions have been reported - 2,402,321,989 cycles # 2.751 GHz - 5,790,080,813 instructions # 2.41 insn per cycle - 0.873863245 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.935888 sec + 2,515,071,638 cycles # 2.678 GHz + 6,112,307,770 instructions # 2.43 insn per cycle + 0.939956946 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5042) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133133105791558 +Relative difference = 2.197525641713777e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.462974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.462974e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.146003 sec -INFO: No Floating Point Exceptions have been reported - 2,072,911,951 cycles # 1.804 GHz - 3,391,607,808 instructions # 1.64 insn per cycle - 1.149850121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.175080 sec + 2,043,785,731 cycles # 1.734 GHz + 3,407,766,286 instructions # 1.67 insn per cycle + 1.179120501 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2112) (512y: 5) (512z: 4366) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133163777514426 +Relative difference = 2.672802420482638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 89f1af02c0..0e323dea81 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:49 +DATE: 2025-09-24_08:57:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.430077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496267e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496267e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.468595 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,993,529,576 cycles # 2.878 GHz - 2,894,144,626 instructions # 1.45 insn per cycle - 0.749153323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 1.804908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.563811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.563811e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 +TOTAL : 0.529866 sec + 2,189,527,812 cycles # 2.825 GHz + 3,234,000,738 instructions # 1.48 insn per cycle + 0.832340190 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508973e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254431e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.658615 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,580,648,777 cycles # 2.888 GHz - 3,894,936,658 instructions # 1.51 insn per cycle - 0.952346890 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
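
The ncu profiles in the 2025 logs no longer report a single monolithic sigmaKin kernel (226 registers per thread in the 2024 baseline above) but three smaller launches, diagram1, diagram2 and color_sum_kernel, at 70, 48 and 32 registers per thread respectively. A toy CUDA sketch of such a split-kernel structure, purely illustrative (the buffer layout, kernel bodies and NCOLOR value are invented here; the real kernels are code generated by the CUDACPP plugin):

#include <cuda_runtime.h>

constexpr int NCOLOR = 2; // invented toy value

// Each per-diagram kernel accumulates its color amplitudes ("jamps").
__global__ void diagram1( double* jamp, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamp[ievt * NCOLOR + 0] += 1.0; // placeholder amplitude
}

__global__ void diagram2( double* jamp, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamp[ievt * NCOLOR + 1] += 0.5; // placeholder amplitude
}

// The final kernel contracts the jamps into |ME|^2 (toy diagonal color matrix).
__global__ void color_sum_kernel( const double* jamp, double* me, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt >= nevt ) return;
  double sum = 0.;
  for( int c = 0; c < NCOLOR; c++ ) sum += jamp[ievt * NCOLOR + c] * jamp[ievt * NCOLOR + c];
  me[ievt] = sum;
}

int main()
{
  const int nevt = 64 * 256; // the "-p 64 256" grid used in these logs
  double *jamp, *me;
  cudaMalloc( &jamp, nevt * NCOLOR * sizeof( double ) );
  cudaMalloc( &me, nevt * sizeof( double ) );
  cudaMemset( jamp, 0, nevt * NCOLOR * sizeof( double ) );
  diagram1<<<64, 256>>>( jamp, nevt );
  diagram2<<<64, 256>>>( jamp, nevt );
  color_sum_kernel<<<64, 256>>>( jamp, me, nevt );
  cudaDeviceSynchronize();
  cudaFree( jamp );
  cudaFree( me );
  return 0;
}

Splitting the per-diagram amplitude work from the final color sum keeps each launch small, which is consistent with the much lower register pressure reported per kernel in these profiles.
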
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.412607e+00
-Avg ME (F77/GPU) = 1.4132214305330990
-Relative difference = 0.0004349621183379836
+Avg ME (F77/GPU) = 1.4132215053590471
+Relative difference = 0.0004350150884479323
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.506368e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.519408e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.519408e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.559975 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 19,280,512,048 cycles # 2.938 GHz
- 59,619,141,119 instructions # 3.09 insn per cycle
- 6.564243260 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.397563e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.409734e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.409734e+04 ) sec^-1
+MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2
+TOTAL : 6.857624 sec
+ 19,700,274,444 cycles # 2.872 GHz
+ 60,456,884,845 instructions # 3.07 insn per cycle
+ 6.861762459 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129949096991936
-Relative difference = 6.390737857384068e-08
+Avg ME (F77/C++) = 1.4129950871652284
+Relative difference = 6.168827799708488e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.092271e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.230160e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.230160e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.707084e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.840722e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.840722e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 2.047307 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 6,043,775,465 cycles # 2.947 GHz
- 17,111,089,922 instructions # 2.83 insn per cycle
- 2.051614364 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.149611 sec
+ 6,176,579,184 cycles # 2.869 GHz
+ 17,549,611,255 instructions # 2.84 insn per cycle
+ 2.153671487 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6412) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129954647353316
-Relative difference = 3.2890090308261873e-07
+Avg ME (F77/C++) = 1.4129953611754331
+Relative difference = 2.5560984512808326e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.748354e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.809701e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.809701e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.618402e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.675763e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.675763e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 0.959425 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,659,679,748 cycles # 2.761 GHz
- 6,224,393,438 instructions # 2.34 insn per cycle
- 0.963869172 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0)
+TOTAL : 1.036238 sec
+ 2,770,490,321 cycles # 2.665 GHz
+ 6,480,361,312 instructions # 2.34 insn per cycle
+ 1.040373935 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5176) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.927524e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.002486e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.002486e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.776625e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.846255e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.846255e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 0.872058 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,421,094,163 cycles # 2.765 GHz
- 5,826,830,021 instructions # 2.41 insn per cycle
- 0.876372578 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0)
+TOTAL : 0.946083 sec
+ 2,541,484,609 cycles # 2.676 GHz
+ 6,148,267,017 instructions # 2.42 insn per cycle
+ 0.950364845 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5042) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.443486e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.486864e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.486864e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412409e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.456214e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.456214e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.160150 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,098,432,349 cycles # 1.804 GHz
- 3,433,067,927 instructions # 1.64 insn per cycle
- 1.164579445 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789)
+TOTAL : 1.186787 sec
+ 2,071,054,443 cycles # 1.740 GHz
+ 3,448,257,390 instructions # 1.66 insn per cycle
+ 1.191043055 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2112) (512y: 5) (512z: 4366)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164033579249
-Relative difference = 2.85398258307829e-07
+Avg ME (F77/C++) = 1.4133163777514426
+Relative difference = 2.672802420482638e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index 7537d3c84d..09a2e62956 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-10-06_09:02:18
+DATE: 2025-09-24_07:51:10
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.658659e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.027503e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.066373e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.611135e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.699057e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.706506e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.462988 sec
-INFO: No Floating Point Exceptions have been reported
- 1,956,715,427 cycles # 2.872 GHz
- 2,757,694,861 instructions # 1.41 insn per cycle
- 0.742544959 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 226
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.520881 sec
+ 2,178,752,390 cycles # 2.830 GHz
+ 3,100,668,964 instructions # 1.42 insn per cycle
+ 0.830191295 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.669827e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.371215e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.415741e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2
-TOTAL : 0.505811 sec
-INFO: No Floating Point Exceptions have been reported
- 2,123,611,289 cycles # 2.883 GHz
- 3,083,974,467 instructions # 1.45 insn per cycle
- 0.793454464 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 70
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.412607e+00
-Avg ME (F77/GPU) = 1.4132214305330990
-Relative difference = 0.0004349621183379836
+Avg ME (F77/GPU) = 1.4132215053590471
+Relative difference = 0.0004350150884479323
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.488365e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.501255e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.501255e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.602997 sec
-INFO: No Floating Point Exceptions have been reported
- 19,409,400,884 cycles # 2.938 GHz
- 59,351,848,666 instructions # 3.06 insn per cycle
- 6.606759387 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.400136e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.412501e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.412501e+04 ) sec^-1
+MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2
+TOTAL : 6.845560 sec
+ 19,647,886,375 cycles # 2.869 GHz
+ 60,294,728,788 instructions # 3.07 insn per cycle
+ 6.849528810 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1290) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129949096991936
-Relative difference = 6.390737857384068e-08
+Avg ME (F77/C++) = 1.4129950871652284
+Relative difference = 6.168827799708488e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.484090e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.633368e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.633368e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.744896e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.876711e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.876711e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.948837 sec
-INFO: No Floating Point Exceptions have been reported
- 5,764,162,956 cycles # 2.953 GHz
- 16,849,716,772 instructions # 2.92 insn per cycle
- 1.952678468 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5610) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.133454 sec
+ 6,138,416,779 cycles # 2.873 GHz
+ 17,495,238,901 instructions # 2.85 insn per cycle
+ 2.137501353 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6406) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129954647353316
-Relative difference = 3.2890090308261873e-07
+Avg ME (F77/C++) = 1.4129953611754331
+Relative difference = 2.5560984512808326e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.522405e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.569181e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.569181e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.626225e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.681915e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.681915e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.094041 sec
-INFO: No Floating Point Exceptions have been reported
- 3,018,102,108 cycles # 2.750 GHz
- 6,848,568,360 instructions # 2.27 insn per cycle
- 1.098202042 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5735) (512y: 0) (512z: 0)
+TOTAL : 1.026128 sec
+ 2,752,520,788 cycles # 2.674 GHz
+ 6,440,444,830 instructions # 2.34 insn per cycle
+ 1.030155534 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5154) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.654265e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.710055e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.710055e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.754480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.821417e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.821417e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.008735 sec
-INFO: No Floating Point Exceptions have been reported
- 2,794,533,058 cycles # 2.762 GHz
- 6,437,695,564 instructions # 2.30 insn per cycle
- 1.012558685 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5509) (512y: 23) (512z: 0)
+TOTAL : 0.952423 sec
+ 2,524,001,981 cycles # 2.641 GHz
+ 6,108,556,264 instructions # 2.42 insn per cycle
+ 0.956460804 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5018) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.323435e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.360072e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.360072e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412561e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.455444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.455444e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.258302 sec
-INFO: No Floating Point Exceptions have been reported
- 2,251,923,496 cycles # 1.787 GHz
- 3,755,291,572 instructions # 1.67 insn per cycle
- 1.262174564 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2467) (512y: 28) (512z: 4084)
+TOTAL : 1.180491 sec
+ 2,041,439,261 cycles # 1.725 GHz
+ 3,405,273,640 instructions # 1.67 insn per cycle
+ 1.184475245 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2088) (512y: 5) (512z: 4366)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164033579249
-Relative difference = 2.85398258307829e-07
+Avg ME (F77/C++) = 1.4133163777514426
+Relative difference = 2.672802420482638e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..c595edebb2
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-09-24_08:18:19
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.142195e+05 1 256
+2.043397e+05 2 256
+3.835250e+05 4 256
+7.446403e+05 8 256
+1.295320e+06 16 256
+1.336801e+06 32 256
+1.135266e+06 64 256
+1.147914e+06 128 256
+1.151761e+06 256 256
+1.173317e+06 512 256
+1.173181e+06 1024 256
+### GPU: scaling test 32
+1.750501e+04 1 32
+3.464900e+04 2 32
+6.628367e+04 4 32
+1.319968e+05 8 32
+2.510299e+05 16 32
+4.434520e+05 32 32
+7.899604e+05 64 32
+1.287919e+06 128 32
+1.372098e+06 256 32
+1.120965e+06 512 32
+1.117757e+06 1024 32
+1.107063e+06 2048 32
+1.136625e+06 4096 32
+1.131581e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.271106e+04 1 256
+2.290369e+04 2 256
+2.311083e+04 4 256
+### CPU: scaling test 32
+2.200591e+04 1 32
+2.184564e+04 2 32
+2.090239e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.213590e+04 1 256
+4.313308e+04 2 256
+4.335077e+04 4 256
+### CPU: scaling test 32
+4.068659e+04 1 32
+4.056430e+04 2 32
+4.125233e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.011697e+04 1 256
+8.566827e+04 2 256
+8.574349e+04 4 256
+### CPU: scaling test 32
+7.598478e+04 1 32
+8.088826e+04 2 32
+8.164364e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.891285e+04 1 256
+8.920943e+04 2 256
+9.219800e+04 4 256
+### CPU: scaling test 32
+9.814686e+04 1 32
+9.351694e+04 2 32
+8.970697e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.059681e+04 1 256
+7.092777e+04 2 256
+7.113395e+04 4 256
+### CPU: scaling test 32
+6.963531e+04 1 32
+7.054720e+04 2 32
+7.077145e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 6b4617ba56..87f352e7da 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-10-06_09:01:06
+DATE: 2025-09-24_07:49:26
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.531107e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.896113e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.014318e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.111499e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.140352e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.142296e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.482291 sec
-INFO: No Floating Point Exceptions have been reported
- 1,996,726,100 cycles # 2.869 GHz
- 2,875,927,393 instructions # 1.44 insn per cycle
- 0.757518934 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.606172 sec
+ 2,425,493,225 cycles # 2.823 GHz
+ 3,641,776,012 instructions # 1.50 insn per cycle
+ 0.915893126 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.039985e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.227093e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.238483e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.618226 sec
-INFO: No Floating Point Exceptions have been reported
- 2,476,524,825 cycles # 2.883 GHz
- 3,787,822,568 instructions # 1.53 insn per cycle
- 0.918414719 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 54
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213823321340 +Relative difference = 4.3709450844674974e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.396101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408087e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285953e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.297043e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297043e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859626 sec -INFO: No Floating Point Exceptions have been reported - 20,206,369,377 cycles # 2.945 GHz - 60,950,595,896 instructions # 3.02 insn per cycle - 6.863727850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1220) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.189052 sec + 20,678,030,694 cycles # 2.875 GHz + 61,089,007,408 instructions # 2.95 insn per cycle + 7.193260515 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651759e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695029e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.317569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.357386e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.357386e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.542669 sec -INFO: No Floating Point Exceptions have been reported - 10,470,195,857 cycles # 2.953 GHz - 30,822,635,750 instructions # 2.94 insn per cycle - 3.546724112 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5351) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.815341 sec + 10,965,720,375 cycles # 2.872 GHz + 31,668,670,846 instructions # 2.89 insn per cycle + 3.819469924 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5888) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.177717e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.345070e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.345070e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.640391e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.799298e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.799298e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.805877 sec -INFO: No Floating Point Exceptions have been reported - 4,960,900,655 cycles # 2.742 GHz - 11,360,293,322 instructions # 2.29 insn per cycle - 1.809915904 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4776) (512y: 0) (512z: 0) +TOTAL : 1.916985 sec + 5,119,164,674 cycles # 2.666 GHz + 11,787,953,551 instructions # 2.30 insn per cycle + 1.921105796 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4783) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.047166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.502267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.693578e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.693578e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.585052 sec -INFO: No Floating Point Exceptions have been reported - 4,379,448,731 cycles # 2.757 GHz - 10,610,063,505 instructions # 2.42 insn per cycle - 1.588995755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4503) (512y: 84) (512z: 0) +TOTAL : 1.744888 sec + 4,669,653,692 cycles # 2.671 GHz + 11,149,149,838 instructions # 2.39 insn per cycle + 1.748994962 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4604) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.890582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987179e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.008286e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.114228e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.114228e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.401138 sec -INFO: No Floating Point Exceptions have been reported - 4,243,505,288 cycles # 1.765 GHz - 6,171,567,257 instructions # 1.45 insn per cycle - 2.405218093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2143) (512y: 116) (512z: 3653) +TOTAL : 2.360776 sec + 4,029,709,616 cycles # 1.705 GHz + 5,980,013,055 instructions # 1.48 insn per cycle + 2.365003134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1534) (512y: 61) (512z: 4253) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1a268fb0a6..b29ebb2181 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:31 +DATE: 2025-09-24_07:50:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.506525e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.986419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111897e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113751e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475723 sec -INFO: No Floating Point Exceptions have been reported - 1,989,777,196 cycles # 2.876 GHz - 2,865,221,599 instructions # 1.44 insn per cycle - 0.750464789 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.608218 sec + 2,430,751,971 cycles # 2.832 GHz + 3,656,805,473 instructions # 1.50 insn per cycle + 0.915307420 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.040967e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229706e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240646e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.612359 sec -INFO: No Floating Point Exceptions have been reported - 2,465,408,367 cycles # 2.885 GHz - 3,759,784,229 instructions # 1.53 insn per cycle - 0.914073870 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 100 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 54 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213823321340 +Relative difference = 4.3709450844674974e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.395973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.407808e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.407808e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283884e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859771 sec -INFO: No Floating Point Exceptions have been reported - 20,239,178,144 cycles # 2.949 GHz - 61,173,779,461 instructions # 3.02 insn per cycle - 6.863706451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1272) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.229703 sec + 20,531,510,190 cycles # 2.839 GHz + 60,852,281,799 instructions # 2.96 insn per cycle + 7.233768348 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1376) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.702334e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747762e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747762e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.310001e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.349804e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.349804e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.505938 sec -INFO: No Floating Point Exceptions have been reported - 10,333,154,234 cycles # 2.946 GHz - 30,534,348,115 instructions # 2.95 insn per cycle - 3.510016853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5155) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.821568 sec + 10,990,977,687 cycles # 2.874 GHz + 31,654,070,133 instructions # 2.88 insn per cycle + 3.825620122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.861323e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.018375e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.018375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.617602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.775102e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.775102e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870783 sec -INFO: No Floating Point Exceptions have been reported - 5,160,894,050 cycles # 2.755 GHz - 11,875,310,688 instructions # 2.30 insn per cycle - 1.874839635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4887) (512y: 0) (512z: 0) +TOTAL : 1.922108 sec + 5,136,210,329 cycles # 2.668 GHz + 11,781,523,745 instructions # 2.29 insn per cycle + 1.926086319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4765) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.768245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.957717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.957717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.469057e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.657949e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.657949e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.697611 sec -INFO: No Floating Point Exceptions have been reported - 4,679,050,155 cycles # 2.751 GHz - 11,168,862,734 instructions # 2.39 insn per cycle - 1.701628470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4508) (512y: 239) (512z: 0) +TOTAL : 1.750928 sec + 4,685,020,284 cycles # 2.671 GHz + 11,142,278,840 instructions # 2.38 insn per cycle + 1.755079681 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4584) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.922687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.020028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.020028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.046477e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153047e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153047e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.390116 sec -INFO: No Floating Point Exceptions have been reported - 4,256,907,095 cycles # 1.778 GHz - 6,411,350,564 instructions # 1.51 insn per cycle - 2.394737171 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 162) (512z: 3731) +TOTAL : 2.348086 sec + 4,023,618,226 cycles # 1.712 GHz + 5,976,028,235 instructions # 1.49 insn per cycle + 2.352309565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1514) (512y: 61) (512z: 4253) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..3c6d7aa243 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:19:04 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.827501e+03 1 256 +9.434129e+03 2 256 +1.859154e+04 4 256 +3.692058e+04 8 256 +7.253540e+04 16 256 +1.151504e+05 32 256 +1.046277e+05 64 256 +1.035386e+05 128 256 +1.074867e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.220421e+02 1 32 +1.240495e+03 2 32 +2.460087e+03 4 32 +4.886577e+03 8 32 +9.554076e+03 16 32 +1.877130e+04 32 32 +3.724483e+04 64 32 +7.268310e+04 128 32 +1.157858e+05 256 32 +1.040650e+05 512 32 +1.010411e+05 1024 32 +1.041504e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.780453e+03 1 256 +1.786081e+03 2 256 +1.788120e+03 4 256 +### CPU: scaling test 32 +1.774852e+03 1 32 +1.777928e+03 2 32 +1.779310e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.203589e+03 1 256 +3.221417e+03 2 256 +3.235841e+03 4 256 +### CPU: scaling test 32 +3.128414e+03 1 32 +3.152927e+03 2 32 +3.182204e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.284645e+03 1 256 +7.272752e+03 2 256 +7.332492e+03 4 256 +### CPU: scaling test 32 +6.902093e+03 1 32 +6.845651e+03 2 32 +6.849626e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.152222e+03 1 256 +8.186030e+03 2 256 +8.253627e+03 4 256 +### CPU: scaling test 32 +8.324421e+03 1 32 +7.984412e+03 2 32 +7.747794e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.579293e+03 1 256 +6.584524e+03 2 256 +6.644529e+03 4 256 +### CPU: scaling test 32 +6.545181e+03 1 32 +6.515841e+03 2 32 +6.481920e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fe9e9669c6..445a870ebb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:02:40 +DATE: 2025-09-24_07:51:45 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.331120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045391e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.543918 sec -INFO: No Floating Point Exceptions have been reported - 2,225,694,406 cycles # 2.884 GHz - 3,483,451,829 instructions # 1.57 insn per cycle - 0.837015502 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.657631 sec + 2,689,336,386 cycles # 2.825 GHz + 4,292,001,683 instructions # 1.60 insn per cycle + 1.009047849 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165914e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.052190 sec -INFO: No Floating Point Exceptions have been reported - 9,689,726,748 cycles # 2.928 GHz - 22,118,867,491 instructions # 2.28 insn per cycle - 3.368998161 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884002e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.775467e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776300e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776300e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.713540 sec -INFO: No Floating Point Exceptions have been reported - 25,683,805,881 cycles # 2.947 GHz - 78,963,253,936 instructions # 3.07 insn per cycle - 8.717598721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.243774 sec + 26,559,464,941 cycles # 2.873 GHz + 80,664,436,977 instructions # 3.04 insn per cycle + 9.247966768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.540501e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543820e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.220602e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.223269e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.223269e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.639000 sec -INFO: No Floating Point Exceptions have been reported - 13,090,618,968 cycles # 2.820 GHz - 39,561,040,325 instructions # 3.02 insn per cycle - 4.644193645 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.098637 sec + 14,012,498,003 cycles # 2.747 GHz + 41,137,206,711 instructions # 2.94 insn per cycle + 5.102679505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.103223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.303668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.317915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.317915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035017 sec -INFO: No Floating Point Exceptions have been reported - 5,608,597,608 cycles # 2.752 GHz - 13,825,354,537 instructions # 2.47 insn per cycle - 2.039075619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.252099 sec + 6,012,721,565 cycles # 2.666 GHz + 14,678,984,496 instructions # 2.44 insn per cycle + 2.256373750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.190120e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.211201e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.211201e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.188749e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.206799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.206799e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791765 sec -INFO: No Floating Point Exceptions have been reported - 4,921,067,926 cycles # 2.743 GHz - 12,507,200,724 instructions # 2.54 insn per cycle - 1.798123347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 2.009058 sec + 5,331,599,497 cycles # 2.649 GHz + 13,558,732,519 instructions # 2.54 insn per cycle + 2.013187817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.012553e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.024911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.024911e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.537614e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.549163e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.549163e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.347251 sec -INFO: No Floating Point Exceptions have been reported - 4,147,263,675 cycles # 1.765 GHz - 6,394,266,077 instructions # 1.54 insn per cycle - 2.352573303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.515421 sec + 4,232,878,333 cycles # 1.681 GHz + 6,889,867,377 instructions # 1.63 insn per cycle + 2.519616006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..abcd7d198a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:38:25 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.431408e+03 1 256 +8.665003e+03 2 256 +1.697877e+04 4 256 +3.338923e+04 8 256 +6.548357e+04 16 256 +1.044894e+05 32 256 +9.824208e+04 64 256 +9.911505e+04 128 256 +1.037671e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.663582e+02 1 32 +1.127328e+03 2 32 +2.249680e+03 4 32 +4.449069e+03 8 32 +8.698388e+03 16 32 +1.716415e+04 32 32 +3.362489e+04 64 32 +6.573394e+04 128 32 +1.057867e+05 256 32 +9.794711e+04 512 32 +9.709625e+04 1024 32 +1.012537e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.785872e+03 1 256 +1.785429e+03 2 256 +1.786204e+03 4 256 +### CPU: scaling test 32 +1.766508e+03 1 32 +1.772286e+03 2 32 +1.775821e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.247136e+03 1 256 +3.244847e+03 2 256 +3.227010e+03 4 256 +### CPU: scaling test 32 +3.152260e+03 1 32 +3.180242e+03 2 32 +3.227514e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.322106e+03 1 256 +7.304087e+03 2 256 +7.292000e+03 4 256 +### CPU: scaling test 32 +6.643951e+03 1 32 +6.794845e+03 2 32 +7.028915e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.142400e+03 1 256 +8.227677e+03 2 256 +8.295040e+03 4 256 +### CPU: scaling test 32 +8.162056e+03 1 32 +8.206971e+03 2 32 +8.075921e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.655188e+03 1 256 +6.639534e+03 2 256 +6.652549e+03 4 256 +### CPU: scaling test 32 +6.476613e+03 1 32 +6.567240e+03 2 32 +6.717274e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..ffa58d7677 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:33:16 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.820656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841888e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.843656e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.356954 sec + 4,918,574,281 cycles # 2.839 GHz + 6,948,443,410 instructions # 1.41 insn per cycle + 1.793979006 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.777070e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777900e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777900e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.235300 sec + 26,537,227,132 cycles # 2.873 GHz + 80,663,724,754 instructions # 3.04 insn per cycle + 9.239490689 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.219777e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.222590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222590e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.099665 sec + 14,039,633,443 cycles # 2.752 GHz + 41,136,919,428 instructions # 2.93 insn per cycle + 5.103868350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.313055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.326878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.326878e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.249075 sec + 5,996,219,788 cycles # 2.662 GHz + 14,679,465,756 instructions # 2.45 insn per cycle + 2.253155059 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.261945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.280295e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.280295e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.991347 sec + 5,330,092,584 cycles # 2.673 GHz + 13,558,700,933 instructions # 2.54 insn per cycle + 1.995510774 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.545564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.556815e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.556815e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.512636 sec + 4,237,115,233 cycles # 1.685 GHz + 6,890,325,988 instructions # 1.63 insn per cycle + 2.516727175 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index bc0987eea5..72edeef53a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:39:37 +DATE: 2025-09-24_08:58:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.954093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.263620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.997819e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033734e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526732 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,227,837,882 cycles # 2.883 GHz - 3,476,505,124 instructions # 1.56 insn per cycle - 0.832118305 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.650712 sec + 2,666,339,708 cycles # 2.832 GHz + 4,289,519,507 instructions # 1.61 insn per cycle + 1.001668457 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.643761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124122e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.301805 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,501,615,955 cycles # 2.935 GHz - 23,489,948,913 instructions # 2.24 insn per cycle - 3.634545913 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880182e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880182e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.777512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.778338e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778338e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.737845 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 25,658,913,414 cycles # 2.936 GHz - 78,963,594,343 instructions # 3.08 insn per cycle - 8.742435740 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.237890 sec + 26,569,728,048 cycles # 2.876 GHz + 80,669,309,749 instructions # 3.04 insn per cycle + 9.242234035 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521735e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521735e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.302860e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.305691e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305691e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.671849 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 13,102,544,659 cycles # 2.802 GHz - 39,572,381,519 instructions # 3.02 insn per cycle - 4.676455621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.976384 sec + 14,071,437,640 cycles # 2.826 GHz + 41,149,571,507 instructions # 2.92 insn per cycle + 4.980974280 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.057114e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.073561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.073561e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.261515e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.275168e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.275168e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.046600 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,627,314,455 cycles # 2.744 GHz - 13,834,298,777 instructions # 2.46 insn per cycle - 2.051219882 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.269473 sec + 6,044,083,589 cycles # 2.659 GHz + 14,689,689,145 instructions # 2.43 insn per cycle + 2.274051232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.239341e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261385e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261385e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.269552e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.288361e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.288361e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.786219 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,931,565,389 cycles # 2.756 GHz - 12,515,991,121 instructions # 2.54 insn per cycle - 1.790909503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.994141 sec + 5,344,979,439 cycles # 2.675 GHz + 13,569,216,111 instructions # 2.54 insn per cycle + 1.998658815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.038188e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.570403e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.581912e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.581912e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.341272 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,150,945,217 cycles # 1.770 GHz - 6,403,675,117 instructions # 1.54 insn per cycle - 2.345955468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.507538 sec + 4,251,930,707 cycles # 1.693 GHz + 6,900,759,684 instructions # 1.62 insn per cycle + 2.512191271 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index be31042fc1..f735859252 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:50:33 +DATE: 2025-09-24_09:15:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.314159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.339458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.341417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039455e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.518126 sec -INFO: No Floating Point Exceptions have been reported - 2,164,802,026 cycles # 2.881 GHz - 3,409,915,390 instructions # 1.58 insn per cycle - 0.811338657 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.641624 sec + 2,661,941,432 cycles # 2.834 GHz + 4,378,783,837 instructions # 1.64 insn per cycle + 0.996435485 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134613e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.165487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.166746e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.140406 sec -INFO: No Floating Point Exceptions have been reported - 9,973,053,404 cycles # 2.934 GHz - 20,986,544,572 instructions # 2.10 insn per cycle - 3.455765313 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884135e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885033e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885033e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.774602e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.775455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775455e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.712529 sec -INFO: No Floating Point Exceptions have been reported - 25,691,717,185 cycles # 2.948 GHz - 78,960,325,856 instructions # 3.07 insn per cycle - 8.716734440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.249876 sec + 26,570,238,887 cycles # 2.872 GHz + 80,663,446,059 instructions # 3.04 insn per cycle + 9.253723929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.543458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546697e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546697e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.215926e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.218618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.218618e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.636367 sec -INFO: No Floating Point Exceptions have been reported - 13,067,183,546 cycles # 2.816 GHz - 39,558,454,763 instructions # 3.03 insn per cycle - 4.640590687 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.107568 sec + 14,024,060,474 cycles # 2.744 GHz + 41,136,719,909 instructions # 2.93 insn per cycle + 5.111745095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.084806e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.317059e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.331542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.331542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.036679 sec -INFO: No Floating Point Exceptions have been reported - 5,613,470,524 cycles # 2.752 GHz - 13,823,796,455 instructions # 2.46 insn per cycle - 2.040900437 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.249705 sec + 6,011,566,666 cycles # 2.669 GHz + 14,678,181,798 instructions # 2.44 insn per cycle + 2.253767973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.198723e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219905e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219905e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.063615e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.080699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.080699e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.791160 sec -INFO: No Floating Point Exceptions have been reported - 4,922,288,820 cycles # 2.743 GHz - 12,503,388,745 instructions # 2.54 insn per cycle - 1.795321275 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 2.041817 sec + 5,403,229,519 cycles # 2.642 GHz + 13,557,173,256 instructions # 2.51 insn per cycle + 2.045882374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975365e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987686e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987686e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.516082e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.527321e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.527321e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.359532 sec -INFO: No Floating Point Exceptions have been reported - 4,155,009,705 cycles # 1.759 GHz - 6,390,945,346 instructions # 1.54 insn per cycle - 2.363732897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.525144 sec + 4,244,374,081 cycles # 1.679 GHz + 6,887,965,141 instructions # 1.62 insn per cycle + 2.529266849 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- 
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 437b6b7cbd..359d241c33 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:47:41 +DATE: 2025-09-24_09:11:37 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.310053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.334627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039836e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.042618e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.518612 sec -INFO: No Floating Point Exceptions have been reported - 2,156,837,380 cycles # 2.875 GHz - 3,433,389,555 instructions # 1.59 insn per cycle - 0.811650542 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.640741 sec + 2,653,679,481 cycles # 2.830 GHz + 4,331,233,604 instructions # 1.63 insn per cycle + 0.994615706 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.128944e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.159258e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.160487e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.091523 sec
-INFO: No Floating Point Exceptions have been reported
- 9,825,563,648 cycles # 2.933 GHz
- 22,802,776,931 instructions # 2.32 insn per cycle
- 3.405923259 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.890035e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.890938e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.890938e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.779631e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.780498e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.780498e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.683864 sec
-INFO: No Floating Point Exceptions have been reported
- 25,635,022,031 cycles # 2.951 GHz
- 78,960,809,140 instructions # 3.08 insn per cycle
- 8.688143049 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.221957 sec
+ 26,521,385,152 cycles # 2.875 GHz
+ 80,663,083,749 instructions # 3.04 insn per cycle
+ 9.226164416 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.535619e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.538805e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.538805e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.188674e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.191422e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.191422e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.644682 sec
-INFO: No Floating Point Exceptions have been reported
- 13,070,212,228 cycles # 2.812 GHz
- 39,558,910,913 instructions # 3.03 insn per cycle
- 4.648863484 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.149366 sec
+ 14,036,040,292 cycles # 2.724 GHz
+ 41,137,477,026 instructions # 2.93 insn per cycle
+ 5.153371013 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.974136e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.989764e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.989764e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.310052e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.323851e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.323851e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.062978 sec
-INFO: No Floating Point Exceptions have been reported
- 5,609,565,523 cycles # 2.715 GHz
- 13,823,736,601 instructions # 2.46 insn per cycle
- 2.067208066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0)
+TOTAL : 2.249818 sec
+ 6,006,737,973 cycles # 2.666 GHz
+ 14,679,070,171 instructions # 2.44 insn per cycle
+ 2.254048539 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.256862e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.278276e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.278276e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.198807e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.216988e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.216988e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.778135 sec
-INFO: No Floating Point Exceptions have been reported
- 4,913,104,520 cycles # 2.758 GHz
- 12,505,156,898 instructions # 2.55 insn per cycle
- 1.782374042 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0)
+TOTAL : 2.006666 sec
+ 5,349,449,687 cycles # 2.662 GHz
+ 13,558,519,738 instructions # 2.53 insn per cycle
+ 2.010742077 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.040533e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.053211e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.053211e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.526022e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.537398e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.537398e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.335968 sec
-INFO: No Floating Point Exceptions have been reported
- 4,137,289,106 cycles # 1.769 GHz
- 6,392,511,975 instructions # 1.55 insn per cycle
- 2.340416062 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386)
+TOTAL : 2.519591 sec
+ 4,240,128,814 cycles # 1.681 GHz
+ 6,889,716,156 instructions # 1.62 insn per cycle
+ 2.523839777 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt
new file mode 100644
index 0000000000..aaf196e26a
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt
@@ -0,0 +1,225 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasNoBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasNoBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+DATE: 2025-09-24_09:22:00
+
+HASBLAS=hasNoBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.046882e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.049339e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.049531e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 0.658941 sec
+ 2,619,370,753 cycles # 2.830 GHz
+ 4,315,879,983 instructions # 1.65 insn per cycle
+ 0.985069394 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 6.626675e-04
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.777719e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.778564e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.778564e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 9.231985 sec
+ 26,530,286,433 cycles # 2.873 GHz
+ 80,663,160,739 instructions # 3.04 insn per cycle
+ 9.235966447 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.214661e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.217440e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.217440e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 5.107602 sec
+ 14,019,762,133 cycles # 2.743 GHz
+ 41,137,185,286 instructions # 2.93 insn per cycle
+ 5.111806436 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 7.325470e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.339668e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.339668e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.245120 sec
+ 6,000,634,007 cycles # 2.669 GHz
+ 14,678,782,792 instructions # 2.45 insn per cycle
+ 2.249347946 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.223501e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.241227e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.241227e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.000737 sec
+ 5,334,637,803 cycles # 2.662 GHz
+ 13,558,380,332 instructions # 2.54 insn per cycle
+ 2.004961011 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 6.549501e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.560889e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.560889e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.510605 sec
+ 4,244,550,697 cycles # 1.689 GHz
+ 6,889,807,087 instructions # 1.62 insn per cycle
+ 2.514890598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index f2b15e4b6f..357c24f245 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,216 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-10-06_09:44:55
+DATE: 2025-09-24_09:07:38

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP=
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.041462e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.325366e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.327398e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.010689e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.040832e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.041036e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.520118 sec
-INFO: No Floating Point Exceptions have been reported
- 2,177,158,293 cycles # 2.891 GHz
- 3,464,316,990 instructions # 1.59 insn per cycle
- 0.812097316 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.647797 sec
+ 2,659,414,719 cycles # 2.832 GHz
+ 4,321,757,984 instructions # 1.63 insn per cycle
+ 0.998229949 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP=
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.734798e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.174453e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.175668e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.213650 sec
-INFO: No Floating Point Exceptions have been reported
- 10,150,922,529 cycles # 2.918 GHz
- 23,231,659,490 instructions # 2.29 insn per cycle
- 3.538737264 seconds time elapsed
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.885407e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.886309e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.886309e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.777828e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.778669e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.778669e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.705137 sec
-INFO: No Floating Point Exceptions have been reported
- 25,650,530,800 cycles # 2.946 GHz
- 78,960,008,246 instructions # 3.08 insn per cycle
- 8.709419634 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.231282 sec
+ 26,561,084,123 cycles # 2.876 GHz
+ 80,664,781,232 instructions # 3.04 insn per cycle
+ 9.235283514 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.551750e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.554937e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.554937e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.218246e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.221081e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.221081e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.623453 sec
-INFO: No Floating Point Exceptions have been reported
- 13,056,946,389 cycles # 2.822 GHz
- 39,559,090,760 instructions # 3.03 insn per cycle
- 4.627712527 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.102107 sec
+ 14,046,126,931 cycles # 2.752 GHz
+ 41,137,219,329 instructions # 2.93 insn per cycle
+ 5.106189798 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.090893e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.106933e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.106933e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.245451e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.259263e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.259263e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.033338 sec
-INFO: No Floating Point Exceptions have been reported
- 5,609,780,879 cycles # 2.754 GHz
- 13,824,722,765 instructions # 2.46 insn per cycle
- 2.037509617 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0)
+TOTAL : 2.270158 sec
+ 6,034,991,732 cycles # 2.655 GHz
+ 14,679,285,512 instructions # 2.43 insn per cycle
+ 2.274258617 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.188897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.209893e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.209893e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.183410e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.201402e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.201402e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.791081 sec
-INFO: No Floating Point Exceptions have been reported
- 4,916,057,270 cycles # 2.740 GHz
- 12,505,186,935 instructions # 2.54 insn per cycle
- 1.795355106 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0)
+TOTAL : 2.010584 sec
+ 5,334,939,311 cycles # 2.649 GHz
+ 13,558,672,256 instructions # 2.54 insn per cycle
+ 2.014695525 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.019116e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.031683e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.031683e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.571999e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.583351e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.583351e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.343107 sec
-INFO: No Floating Point Exceptions have been reported
- 4,136,898,273 cycles # 1.763 GHz
- 6,392,336,539 instructions # 1.55 insn per cycle
- 2.347534329 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386)
+TOTAL : 2.502044 sec
+ 4,233,681,993 cycles # 1.690 GHz
+ 6,889,572,033 instructions # 1.63 insn per cycle
+ 2.506227900 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index 99e413a8a3..29ef532558 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:14 +DATE: 2025-09-24_07:52:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.332738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039395e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.542209 sec -INFO: No Floating Point Exceptions have been reported - 2,220,139,727 cycles # 2.875 GHz - 3,465,138,857 instructions # 1.56 insn per cycle - 0.835706398 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.662823 sec + 2,690,649,108 cycles # 2.828 GHz + 4,286,045,440 instructions # 1.59 insn per cycle + 1.013510505 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.145716e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.177708e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.039240 sec
-INFO: No Floating Point Exceptions have been reported
- 9,630,090,535 cycles # 2.918 GHz
- 21,945,170,652 instructions # 2.28 insn per cycle
- 3.356721463 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.881580e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.882499e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.882499e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.781146e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.781991e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.781991e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.723377 sec
-INFO: No Floating Point Exceptions have been reported
- 25,611,709,249 cycles # 2.935 GHz
- 78,703,444,126 instructions # 3.07 insn per cycle
- 8.727502935 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4191) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.213856 sec
+ 26,488,697,165 cycles # 2.874 GHz
+ 80,598,102,732 instructions # 3.04 insn per cycle
+ 9.217833797 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6108) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.593581e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.596889e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.596889e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.190938e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.193680e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.193680e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.571814 sec
-INFO: No Floating Point Exceptions have been reported
- 13,039,592,628 cycles # 2.851 GHz
- 39,453,086,877 instructions # 3.03 insn per cycle
- 4.575893049 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:12966) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.145449 sec
+ 14,122,597,527 cycles # 2.743 GHz
+ 41,123,270,337 instructions # 2.91 insn per cycle
+ 5.149556341 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20833) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.986878e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.003760e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.003760e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.251338e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.264895e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.264895e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.061484 sec
-INFO: No Floating Point Exceptions have been reported
- 5,673,128,561 cycles # 2.749 GHz
- 13,911,820,426 instructions # 2.45 insn per cycle
- 2.066505881 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11582) (512y: 0) (512z: 0)
+TOTAL : 2.268038 sec
+ 6,006,066,444 cycles # 2.645 GHz
+ 14,676,192,844 instructions # 2.44 insn per cycle
+ 2.272143843 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14857) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.098916e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.119150e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.119150e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.177622e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.195568e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.195568e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.809563 sec
-INFO: No Floating Point Exceptions have been reported
- 4,990,015,585 cycles # 2.753 GHz
- 12,604,471,256 instructions # 2.53 insn per cycle
- 1.813650628 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10423) (512y: 241) (512z: 0)
+TOTAL : 2.011600 sec
+ 5,382,012,156 cycles # 2.671 GHz
+ 13,555,448,039 instructions # 2.52 insn per cycle
+ 2.015732823 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14538) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.910207e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.922434e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.922434e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.533539e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.545433e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.545433e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.380650 sec
-INFO: No Floating Point Exceptions have been reported
- 4,192,440,259 cycles # 1.759 GHz
- 6,502,191,985 instructions # 1.55 insn per cycle
- 2.384674618 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1754) (512y: 193) (512z: 9382)
+TOTAL : 2.516570 sec
+ 4,253,387,112 cycles # 1.688 GHz
+ 6,887,935,639 instructions # 1.62 insn per cycle
+ 2.520747303 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1741) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index 76362e2777..4f4356c48c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:30:00
+DATE: 2025-09-24_08:48:02
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.108959e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.129301e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.130870e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.042496e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.045142e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.045338e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.545749 sec
-INFO: No Floating Point Exceptions have been reported
- 2,205,865,001 cycles # 2.840 GHz
- 3,412,138,367 instructions # 1.55 insn per cycle
- 0.835130533 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.657301 sec
+ 2,688,164,890 cycles # 2.832 GHz
+ 4,316,171,739 instructions # 1.61 insn per cycle
+ 1.011009554 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.747537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.771352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.772362e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.317305 sec -INFO: No Floating Point Exceptions have been reported - 10,470,225,400 cycles # 2.928 GHz - 22,893,642,046 instructions # 2.19 insn per cycle - 3.632348979 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.279433e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.279917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.279917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.749826e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.750647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.750647e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.330200 sec -INFO: No Floating Point Exceptions have been reported - 112,786,835,820 cycles # 2.943 GHz - 144,812,254,859 instructions # 1.28 insn per cycle - 38.334547107 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21273) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.379113 sec + 26,934,267,861 cycles # 2.871 GHz + 73,851,988,552 instructions # 2.74 insn per cycle + 9.383150461 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.132336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.195954e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.198717e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.198717e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.242571 sec -INFO: No Floating Point Exceptions have been reported - 14,761,048,074 cycles # 2.814 GHz - 37,609,615,991 instructions # 2.55 insn per cycle - 5.246531710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68172) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.137648 sec + 13,984,746,789 cycles # 2.720 GHz + 37,779,059,141 instructions # 2.70 insn per cycle + 5.141898165 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141231E-004 +Relative difference = 2.8372990629266697e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.367426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.381363e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.381363e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.579100e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.594477e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.594477e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.233268 sec -INFO: No Floating Point Exceptions have been reported - 6,121,196,467 cycles # 2.737 GHz - 13,054,881,187 instructions # 2.13 insn per cycle - 2.237420808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46946) (512y: 0) (512z: 0) +TOTAL : 2.170436 sec + 5,754,413,095 cycles # 2.647 GHz + 12,930,299,739 instructions # 2.25 insn per cycle + 2.174693824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:44526) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.964974e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.985321e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.985321e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.763945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.784121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.784121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836637 sec -INFO: No Floating Point Exceptions have been reported - 5,064,709,437 cycles # 2.753 GHz - 11,452,008,336 instructions # 2.26 insn per cycle - 1.840705951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40486) (512y: 285) (512z: 0) +TOTAL : 1.877679 sec + 4,952,203,180 cycles # 2.633 GHz + 11,588,997,021 instructions # 2.34 insn per cycle + 1.881750512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39643) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.358991e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.372760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.372760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.037416e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.050913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.050913e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.235964 sec -INFO: No Floating Point Exceptions have been reported - 3,956,538,826 cycles # 1.767 GHz - 5,928,749,634 instructions # 1.50 insn per cycle - 2.240037452 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 337) (512z:39338) +TOTAL : 2.337052 sec + 3,936,542,106 cycles # 1.682 GHz + 5,928,998,571 instructions # 1.51 insn per cycle + 2.341312783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1600) (512y: 64) (512z:38942) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5040f4b335..4a7bf9e008 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:31:09
+DATE: 2025-09-24_08:48:56
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.107076e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.130192e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.131670e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.039955e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.042489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.042684e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.539226 sec
-INFO: No Floating Point Exceptions have been reported
- 2,240,615,938 cycles # 2.902 GHz
- 3,467,491,001 instructions # 1.55 insn per cycle
- 0.828466018 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.659114 sec
+ 2,692,518,276 cycles # 2.825 GHz
+ 4,310,380,031 instructions # 1.60 insn per cycle
+ 1.010529095 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.751881e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.775679e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.776668e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.303070 sec
-INFO: No Floating Point Exceptions have been reported
- 10,434,569,638 cycles # 2.930 GHz
- 24,118,235,140 instructions # 2.31 insn per cycle
- 3.617886016 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158122E-004
-Relative difference = 2.837296513854949e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 4.241409e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.241886e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.241886e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.754652e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.755482e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.755482e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 38.674103 sec
-INFO: No Floating Point Exceptions have been reported
- 113,958,477,984 cycles # 2.947 GHz
- 144,286,195,418 instructions # 1.27 insn per cycle
- 38.678088373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:21024) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.353085 sec
+ 26,827,161,725 cycles # 2.868 GHz
+ 73,583,039,012 instructions # 2.74 insn per cycle
+ 9.357114519 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12764) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198140450E-004
-Relative difference = 2.83729918072716e-07
+Avg ME (F77/C++) = 6.6266731198140461E-004
+Relative difference = 2.8372991790910424e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.007169e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.009483e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.009483e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.209478e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.212226e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.212226e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.460584 sec
-INFO: No Floating Point Exceptions have been reported
- 15,281,187,875 cycles # 2.797 GHz
- 37,839,169,102 instructions # 2.48 insn per cycle
- 5.464853538 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:68594) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.115971 sec
+ 13,960,203,452 cycles # 2.727 GHz
+ 37,775,458,315 instructions # 2.71 insn per cycle
+ 5.120088126 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:66935) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141209E-004
-Relative difference = 2.8372990661989057e-07
+Avg ME (F77/C++) = 6.6266731198141231E-004
+Relative difference = 2.8372990629266697e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.567317e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.582163e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.582163e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.651963e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.666829e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.666829e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.174218 sec
-INFO: No Floating Point Exceptions have been reported
- 6,020,206,289 cycles # 2.765 GHz
- 12,923,983,464 instructions # 2.15 insn per cycle
- 2.178219828 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46048) (512y: 0) (512z: 0)
+TOTAL : 2.149782 sec
+ 5,705,194,110 cycles # 2.650 GHz
+ 12,866,644,870 instructions # 2.26 insn per cycle
+ 2.153952917 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:43855) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156811E-004
+Relative difference = 2.837296711825217e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.900478e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.920792e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.920792e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.882866e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.903987e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.903987e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.849478 sec
-INFO: No Floating Point Exceptions have been reported
- 5,102,330,026 cycles # 2.754 GHz
- 11,453,366,172 instructions # 2.24 insn per cycle
- 1.853513717 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40151) (512y: 219) (512z: 0)
+TOTAL : 1.852637 sec
+ 4,930,923,995 cycles # 2.657 GHz
+ 11,551,223,645 instructions # 2.34 insn per cycle
+ 1.856813121 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39128) (512y: 43) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156811E-004
+Relative difference = 2.837296711825217e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.368242e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.382314e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.382314e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.061104e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.074586e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.074586e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.232876 sec
-INFO: No Floating Point Exceptions have been reported
- 3,951,515,189 cycles # 1.767 GHz
- 5,896,746,544 instructions # 1.49 insn per cycle
- 2.236852257 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1959) (512y: 259) (512z:38977)
+TOTAL : 2.329299 sec
+ 3,932,028,853 cycles # 1.686 GHz
+ 5,912,324,948 instructions # 1.50 insn per cycle
+ 2.333629850 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1168) (512y: 48) (512z:38635)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156821E-004
+Relative difference = 2.8372967101890994e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling
new file mode 100644
index 0000000000..28731b3dfa
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+DATE: 2025-09-24_08:20:20
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+4.874571e+03 1 256
+9.668344e+03 2 256
+1.898243e+04 4 256
+3.892187e+04 8 256
+7.406167e+04 16 256
+1.450850e+05 32 256
+2.285874e+05 64 256
+2.300604e+05 128 256
+2.321676e+05 256 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+6.430243e+02 1 32
+1.275993e+03 2 32
+2.526380e+03 4 32
+5.085785e+03 8 32
+1.005979e+04 16 32
+1.929453e+04 32 32
+3.746383e+04 64 32
+7.436476e+04 128 32
+1.449171e+05 256 32
+2.233797e+05 512 32
+2.278699e+05 1024 32
+2.267486e+05 2048 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.815758e+03 1 256
+1.824941e+03 2 256
+1.812890e+03 4 256
+### CPU: scaling test 32
+1.799421e+03 1 32
+1.811217e+03 2 32
+1.817109e+03 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.694201e+03 1 256
+6.530240e+03 2 256
+6.618481e+03 4 256
+### CPU: scaling test 32
+6.203984e+03 1 32
+6.493493e+03 2 32
+6.399063e+03 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.412108e+04 1 256
+1.422537e+04 2 256
+1.448658e+04 4 256
+### CPU: scaling test 32
+1.445264e+04 1 32
+1.450156e+04 2 32
+1.391386e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.559570e+04 1 256
+1.624796e+04 2 256
+1.592395e+04 4 256
+### CPU: scaling test 32
+1.467926e+04 1 32
+1.562225e+04 2 32
+1.527629e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.314281e+04 1 256
+1.318480e+04 2 256
+1.327322e+04 4 256
+### CPU: scaling test 32
+1.281594e+04 1 32
+1.304798e+04 2 32
+1.317554e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index c4676334b0..9767c1c873 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:04:57
+DATE: 2025-09-24_07:55:17
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.476973e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.519601e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.523500e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.264243e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.271869e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.272617e+05 ) sec^-1
MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.498075 sec
-INFO: No Floating Point Exceptions have been reported
- 2,049,620,143 cycles # 2.856 GHz
- 3,058,097,989 instructions # 1.49 insn per cycle
- 0.977244524 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.573086 sec
+ 2,350,779,156 cycles # 2.827 GHz
+ 3,450,243,943 instructions # 1.47 insn per cycle
+ 0.888014676 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.124860e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.187008e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.189727e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.797790 sec
-INFO: No Floating Point Exceptions have been reported
- 5,916,497,978 cycles # 2.910 GHz
- 12,115,730,956 instructions # 2.05 insn per cycle
- 2.090370837 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262667672387088E-004
-Relative difference = 2.825534762507892e-05
+Avg ME (F77/GPU) = 6.6262666367186696E-004
+Relative difference = 2.827504444018108e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.932981e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.933931e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.933931e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.809043e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.809885e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.809885e+03 ) sec^-1
MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.490769 sec
-INFO: No Floating Point Exceptions have been reported
- 24,922,868,630 cycles # 2.935 GHz
- 79,110,265,707 instructions # 3.17 insn per cycle
- 8.496015758 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.070907 sec
+ 26,076,793,123 cycles # 2.874 GHz
+ 81,082,815,652 instructions # 3.11 insn per cycle
+ 9.074860218 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274863312764526E-004
-Relative difference = 4.998523613136231e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627487e-04
+Avg ME (F77/C++) = 6.6274865557072044E-004
+Relative difference = 6.703789776019192e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.975543e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.988298e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.988298e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.356100 sec
-INFO: No Floating Point Exceptions have been reported
- 6,536,263,436 cycles # 2.771 GHz
- 20,271,266,485 instructions # 3.10 insn per cycle
- 2.362378155 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 6.599536e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.610894e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.610894e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4
+TOTAL : 2.490195 sec
+ 6,817,962,157 cycles # 2.735 GHz
+ 21,064,372,112 instructions # 3.09 insn per cycle
+ 2.494235927 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274861442972011E-004
-Relative difference = 2.1772539563413118e-08
+Avg ME (F77/C++) = 6.6274862240394555E-004
+Relative difference = 3.3804591304642774e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.588631e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.595153e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.595153e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.435208e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.440763e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.440763e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.038490 sec
-INFO: No Floating Point Exceptions have been reported
- 2,837,721,779 cycles # 2.726 GHz
- 7,066,858,765 instructions # 2.49 insn per cycle
- 1.044464831 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0)
+TOTAL : 1.147813 sec
+ 3,056,395,258 cycles # 2.655 GHz
+ 7,494,074,266 instructions # 2.45 insn per cycle
+ 1.151895136 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174396888E-004
-Relative difference = 2.7547150614455683e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.762421e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.770702e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.770702e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.554794e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.561237e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.561237e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.936394 sec
-INFO: No Floating Point Exceptions have been reported
- 2,577,125,275 cycles # 2.745 GHz
- 6,404,206,024 instructions # 2.49 insn per cycle
- 0.941322355 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0)
+TOTAL : 1.059930 sec
+ 2,742,793,309 cycles # 2.580 GHz
+ 6,930,877,377 instructions # 2.53 insn per cycle
+ 1.063917437 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174396888E-004
-Relative difference = 2.7547150614455683e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.409980e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.415034e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.415034e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.312360e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.316925e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.316925e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.170914 sec
-INFO: No Floating Point Exceptions have been reported
- 2,069,436,546 cycles # 1.766 GHz
- 3,304,699,013 instructions # 1.60 insn per cycle
- 1.174781391 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605)
+TOTAL : 1.254936 sec
+ 2,115,349,761 cycles # 1.681 GHz
+ 3,555,960,059 instructions # 1.68 insn per cycle
+ 1.258998790 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952779718007E-004
-Relative difference = 4.194411063934945e-08
+Avg ME (F77/C++) = 6.6271953539095291E-004
+Relative difference = 5.340261281526277e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling
new file mode 100644
index 0000000000..5aa2f01211
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:40:35 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.512528e+03 1 256 +8.976635e+03 2 256 +1.751058e+04 4 256 +3.430598e+04 8 256 +6.666715e+04 16 256 +1.291068e+05 32 256 +2.050430e+05 64 256 +2.107149e+05 128 256 +2.155656e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.890329e+02 1 32 +1.152212e+03 2 32 +2.292040e+03 4 32 +4.632888e+03 8 32 +9.198270e+03 16 32 +1.769087e+04 32 32 +3.431557e+04 64 32 +6.676554e+04 128 32 +1.285799e+05 256 32 +2.026054e+05 512 32 +2.091692e+05 1024 32 +2.111970e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.824032e+03 1 256 +1.809863e+03 2 256 +1.820597e+03 4 256 +### CPU: scaling test 32 +1.801488e+03 1 32 +1.816095e+03 2 32 +1.818887e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.541208e+03 1 256 +6.646471e+03 2 256 +6.674420e+03 4 256 +### CPU: scaling test 32 +6.654268e+03 1 32 +6.445369e+03 2 32 +6.483277e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.403915e+04 1 256 +1.421509e+04 2 256 +1.432262e+04 4 256 +### CPU: scaling test 32 +1.327148e+04 1 32 +1.383509e+04 2 32 +1.411012e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.518580e+04 1 256 +1.604878e+04 2 256 +1.599833e+04 4 256 +### CPU: scaling test 32 +1.493653e+04 1 32 +1.536784e+04 2 32 +1.504996e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.296020e+04 1 256 +1.323754e+04 2 256 +1.324766e+04 4 256 +### CPU: scaling test 32 +1.287590e+04 1 32 +1.305082e+04 2 32 +1.315256e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..f26cad1106 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:35:23 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.037973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.043230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.043825e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 1.288940 sec + 4,705,960,611 cycles # 2.831 GHz + 6,563,046,338 instructions # 1.39 insn per cycle + 1.721562672 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664686948802E-004 +Relative difference = 2.845130693853636e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.810761e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.811590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811590e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 9.062464 sec + 26,064,650,444 cycles # 2.875 GHz + 81,082,234,132 instructions # 3.11 insn per cycle + 9.066348887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.534787e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.546219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.546219e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.514677 sec + 6,853,715,855 cycles # 2.722 GHz + 21,065,386,704 instructions # 3.07 insn per cycle + 2.518541485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.442665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.448022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448022e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.142061 sec + 3,053,183,512 cycles # 2.666 GHz + 7,494,362,915 instructions # 2.45 insn per cycle + 1.145914831 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.601681e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.608402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.608402e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.029063 sec + 2,740,562,821 cycles # 2.655 GHz + 6,930,786,130 instructions # 2.53 insn per cycle + 1.033186814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.312127e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.316575e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316575e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.258103 sec + 2,118,915,872 cycles # 1.682 GHz + 3,556,524,861 instructions # 1.68 insn per cycle + 1.263934339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index dec260c3af..ad335111b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:40:11 +DATE: 2025-09-24_08:59:11 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.924368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.147378e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.246468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.246468e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.481369 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,011,468,293 cycles # 2.883 GHz - 2,972,689,221 instructions # 1.48 insn per cycle - 0.755097926 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.557991 sec + 2,292,595,764 cycles # 2.825 GHz + 3,436,278,321 instructions # 1.50 insn per cycle + 0.868439857 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.978465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.128974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.128974e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.967107 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,502,759,539 cycles # 2.928 GHz - 13,854,302,325 instructions # 2.13 insn per cycle - 2.276466534 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.944212e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945160e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945160e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810608e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.811460e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811460e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.443358 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 24,934,407,175 cycles # 2.952 GHz - 79,115,502,595 instructions # 3.17 insn per cycle - 8.447759712 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.065521 sec + 26,088,065,408 cycles # 2.877 GHz + 81,087,099,769 instructions # 3.11 insn per cycle + 9.069721762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.020230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033459e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033459e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.344217 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,541,090,448 cycles # 2.786 GHz - 20,280,124,954 instructions # 3.10 insn per cycle - 2.348689069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.570788e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.582625e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.582625e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.503847 sec + 6,867,586,123 cycles # 2.739 GHz + 21,073,651,457 instructions # 3.07 insn per cycle + 2.508233409 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program 
aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.611581e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.611581e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.443366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.449038e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.449038e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.029784 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,846,767,262 cycles # 2.755 GHz - 7,076,446,064 instructions # 2.49 insn per cycle - 1.034215836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.143966 sec + 3,060,195,809 cycles # 2.667 GHz + 7,503,599,826 instructions # 2.45 insn per cycle + 1.148081805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.797566e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.806224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.806224e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597626e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604737e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604737e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.920078 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,539,792,408 cycles # 2.749 GHz - 6,413,266,409 instructions # 2.53 insn per cycle - 0.924434981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.034413 sec + 2,756,912,293 cycles # 2.656 GHz + 6,940,381,664 instructions # 2.52 insn per cycle + 1.038660514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.323709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.328328e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328328e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170311 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,078,956,436 cycles # 1.771 GHz - 3,314,205,136 instructions # 1.59 insn per cycle - 1.174679954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.247183 sec + 2,125,047,261 cycles # 1.699 GHz + 3,566,175,366 instructions # 1.68 insn per cycle + 1.251460212 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 3ebd5caeb8..95ef8247c5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:51:07 +DATE: 2025-09-24_09:16:35 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.481675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.521755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.525865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.240709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.248323e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.477918 sec -INFO: No Floating Point Exceptions have been reported - 1,990,228,523 cycles # 2.864 GHz - 2,978,927,673 instructions # 1.50 insn per cycle - 0.751663902 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.554830 sec + 2,312,115,314 cycles # 2.824 GHz + 3,412,833,583 instructions # 1.48 insn per cycle + 0.876321370 seconds time elapsed ......................................................................... 
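
Note on the perf counters above: the "insn per cycle" figure is simply the instruction count divided by the cycle count from the same run. A minimal sketch recomputing it (Python; values copied verbatim from the check_cuda.exe run above, and ipc() is a hypothetical helper, not part of the madgraph4gpu tooling):

# Reproduce the 'insn per cycle' figure reported by perf above.
def ipc(instructions: int, cycles: int) -> float:
    return instructions / cycles

cycles = 2_312_115_314        # cycles counted for the cuda_f_inl0_hrd0 run
instructions = 3_412_833_583  # instructions counted for the same run
print(f"{ipc(instructions, cycles):.2f} insn per cycle")  # -> 1.48
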
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.037728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101846e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.886731 sec -INFO: No Floating Point Exceptions have been reported - 6,136,710,401 cycles # 2.909 GHz - 13,142,850,218 instructions # 2.14 insn per cycle - 2.175693489 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942240e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942240e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.455097 sec -INFO: No Floating Point Exceptions have been reported - 24,914,950,228 cycles # 2.946 GHz - 79,111,045,664 instructions # 3.18 insn per cycle - 8.459383915 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.807718e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.808561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.808561e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 9.079227 sec + 26,084,123,419 cycles # 2.872 GHz + 81,083,696,494 instructions # 3.11 insn per cycle + 9.083053146 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
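
The "Relative difference" printed by these cmpExe comparisons follows directly from the two averages quoted just above it. A minimal sketch of that check (Python; it assumes the C++ average as the denominator, which reproduces the logged value to the digits shown, and reldiff() is a hypothetical name):

# Recompute the F77-vs-C++ consistency check from the averages above
# (values from the build.cuda_f_inl0_hrd0 comparison in this log).
def reldiff(a: float, b: float) -> float:
    return abs(a - b) / abs(a)

avg_me_cpp_gpu = 6.626454e-04            # Avg ME (C++/GPU)
avg_me_f77_gpu = 6.6262666367186696e-04  # Avg ME (F77/GPU)
rd = reldiff(avg_me_cpp_gpu, avg_me_f77_gpu)
print(f"Relative difference = {rd:.6e}")  # ~2.8275e-05
assert rd <= 5e-3                         # -> OK (relative difference <= 5E-3)
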
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.977213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.990041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.990041e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.589773e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.602019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.602019e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.356205 sec -INFO: No Floating Point Exceptions have been reported - 6,550,546,250 cycles # 2.776 GHz - 20,269,237,886 instructions # 3.09 insn per cycle - 2.360272003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.494806 sec + 6,814,877,087 cycles # 2.729 GHz + 21,062,900,930 instructions # 3.09 insn per cycle + 2.498709374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.601317e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.608084e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608084e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.441516e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447328e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447328e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.030095 sec -INFO: No Floating Point Exceptions have been reported - 2,839,431,727 cycles # 2.748 GHz - 7,063,774,184 instructions # 2.49 insn per cycle - 1.034210988 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.144362 sec + 3,058,910,732 cycles # 2.667 GHz + 7,493,335,738 instructions # 2.45 insn per cycle + 1.148273072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 
0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801735e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.810193e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.810193e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552103e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558635e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558635e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.916264 sec -INFO: No Floating Point Exceptions have been reported - 2,529,614,240 cycles # 2.751 GHz - 6,399,972,746 instructions # 2.53 insn per cycle - 0.920311559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.063343 sec + 2,755,997,502 cycles # 2.585 GHz + 6,929,610,844 instructions # 2.51 insn per cycle + 1.067272546 seconds time elapsed +=Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.413582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.318308e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322938e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322938e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.166574 sec -INFO: No Floating Point Exceptions have been reported - 2,070,023,042 cycles # 1.769 GHz - 3,300,470,940 instructions # 1.59 insn per cycle - 1.170621524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.250811 sec + 2,123,388,726 cycles # 1.694 GHz + 3,554,459,202 instructions # 1.67 insn per cycle + 1.254752290 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 8aa78a916d..78f451b7bd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:48:16 +DATE: 2025-09-24_09:12:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.460370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.501314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.253486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260232e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260954e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.475676 sec -INFO: No Floating Point Exceptions have been reported - 1,998,344,168 cycles # 2.886 GHz - 3,027,104,836 instructions # 1.51 insn per cycle - 0.748859673 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.556466 sec + 2,306,511,731 cycles # 2.823 GHz + 3,405,943,740 instructions # 1.48 insn per cycle + 0.875455709 seconds time elapsed ......................................................................... 
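
The recurring "DEBUG: MEK (channelid array)" lines encode a per-channel event histogram: in these 512-event debug runs, the events land on 16 of the 123 channels (channels 2 through 17), 32 events each. A purely illustrative bookkeeping check (Python):

# Sanity-check the channelid histogram printed in the DEBUG MEK lines:
# 512 events spread over channels 2..17, 32 events each.
histogram = {ch: 32 for ch in range(2, 18)}
assert sum(histogram.values()) == 512  # all 512 events accounted for
assert len(histogram) == 16            # only 16 of the 123 channels are hit
print(f"{sum(histogram.values())} events across {len(histogram)} populated channels")
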
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.172168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.234506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.237328e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.821851 sec -INFO: No Floating Point Exceptions have been reported - 6,001,499,639 cycles # 2.924 GHz - 13,042,334,044 instructions # 2.17 insn per cycle - 2.109220847 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941510e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942442e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942442e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.815805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.452200 sec -INFO: No Floating Point Exceptions have been reported - 24,907,540,526 cycles # 2.946 GHz - 79,109,866,227 instructions # 3.18 insn per cycle - 8.456266423 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.037183 sec + 25,981,381,778 cycles # 2.874 GHz + 81,082,014,607 instructions # 3.12 insn per cycle + 9.041093443 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.017369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030395e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.030395e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.341887 sec -INFO: No Floating Point Exceptions have been reported - 6,533,658,672 cycles # 2.786 GHz - 20,270,788,705 instructions # 3.10 insn per cycle - 2.345994128 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.589046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.600832e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.600832e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.493906 sec + 6,799,938,748 cycles # 2.723 GHz + 21,064,168,821 instructions # 3.10 insn per cycle + 2.497884091 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604029e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442997e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.448430e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448430e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027451 sec -INFO: No Floating Point Exceptions have been reported - 2,836,206,155 cycles # 2.751 GHz - 7,065,988,768 instructions # 2.49 insn per cycle - 1.031531216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.141752 sec + 3,049,041,388 cycles # 2.663 GHz + 7,493,851,873 instructions # 2.46 insn per cycle + 1.145791302 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.796598e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804847e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804847e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.606353e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.613147e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.613147e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.917896 sec -INFO: No Floating Point Exceptions have been reported - 2,527,698,465 cycles # 2.744 GHz - 6,403,574,368 instructions # 2.53 insn per cycle - 0.921906155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.026037 sec + 2,741,511,931 cycles # 2.664 GHz + 6,930,338,040 instructions # 2.53 insn per cycle + 1.030062983 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.320960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325425e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.164994 sec -INFO: No Floating Point Exceptions have been reported - 2,068,678,617 cycles # 1.770 GHz - 3,304,093,166 instructions # 1.60 insn per cycle - 1.169236265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.246929 sec + 2,118,661,680 cycles # 1.695 GHz + 3,556,802,211 instructions # 1.68 insn 
per cycle + 1.251020606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..014c2362f3 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_09:23:46 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.277839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.284276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.284966e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.572758 sec + 2,262,778,540 cycles # 2.823 GHz + 3,427,514,498 instructions # 1.51 insn per cycle + 0.858446661 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.805032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805867e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.805867e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 9.091007 sec + 26,124,244,226 cycles # 2.873 GHz + 81,083,408,377 instructions # 3.10 insn per cycle + 9.094976493 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
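The throughput lines above can be read back directly from the command line: '-p 64 256 1' requests 64 blocks of 256 threads for 1 iteration, i.e. 16384 events, and each EvtsPerSec counter divides that event count by the time spent in the corresponding timed section. A minimal sketch of the arithmetic in Python (the evts_per_sec helper is hypothetical; only the 64/256/1 decomposition and the timings are taken from the log above):

def evts_per_sec(blocks: int, threads: int, iterations: int, seconds: float) -> float:
    # events = blocks * threads * iterations; throughput = events / elapsed time
    return blocks * threads * iterations / seconds

# -p 64 256 1 over the ~9.09 s TOTAL of the 'none' run above:
print(evts_per_sec(64, 256, 1, 9.091007))  # ~1802 events/s, within a few permille of EvtsPerSec[Rmb+ME] = 1.805032e+03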
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.666139e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.678572e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.678572e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.465205 sec + 6,801,628,597 cycles # 2.755 GHz + 21,064,284,503 instructions # 3.10 insn per cycle + 2.469351630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
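The cycle and instruction counts in these listings come from perf stat, and the figures after '#' are derived ratios: instructions over cycles gives the 'insn per cycle' value, and cycles over the measured CPU time gives the effective clock. Checking the sse4 run above with pure arithmetic on the logged counters (the small discrepancy in the GHz figure is expected, since perf normalises by task-clock rather than wall-clock time):

cycles, instructions, seconds = 6_801_628_597, 21_064_284_503, 2.469351630
print(instructions / cycles)   # ~3.10, the logged 'insn per cycle'
print(cycles / seconds / 1e9)  # ~2.75, close to the logged '2.755 GHz'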
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.445029e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450720e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450720e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.139925 sec + 3,050,112,240 cycles # 2.668 GHz + 7,494,303,135 instructions # 2.46 insn per cycle + 1.144054256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
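The '=Symbols in CPPProcess_cpp.o=' lines summarise which SIMD flavours actually ended up in the compiled object, which is how these logs verify that each backend delivered the vectorisation it promised: the 'none' build above shows only scalar/sse4-class symbols, the avx2 build shows avx2 symbols, and the 512z build further down is dominated by 512z symbols. A rough illustration of such a census, assuming the counts come from classifying disassembled instructions by SIMD register class (the script actually used by the test driver, and its exact categories, may differ):

import re, subprocess

def simd_census(objfile: str) -> dict:
    # Disassemble the object file and count instructions touching each register class
    asm = subprocess.run(["objdump", "-d", objfile], capture_output=True, text=True).stdout
    return {label: len(re.findall(regex, asm))
            for label, regex in (("xmm/sse", r"%xmm"), ("ymm/avx2", r"%ymm"), ("zmm/512z", r"%zmm"))}

On a build.512z_* object one would expect the zmm bucket to dominate, mirroring the (512z:14466) entries in these logs.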
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.602494e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.609389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.609389e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.028625 sec + 2,754,615,458 cycles # 2.670 GHz + 6,930,577,140 instructions # 2.52 insn per cycle + 1.032612157 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
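Each cmpExe block pairs the C++ (or CUDA) executable with its Fortran wrapper on identical '--common' inputs and demands that the two average matrix elements agree. The logged 'Relative difference' values are consistent with |ME_f77 - ME_cpp| / |ME_cpp|; a minimal sketch of the tolerance check, using the avx2 numbers above:

def rel_diff(me_cpp: float, me_f77: float) -> float:
    return abs(me_f77 - me_cpp) / abs(me_cpp)

r = rel_diff(6.627193e-04, 6.6271934460905568e-04)
print(r)          # ~6.731e-08, as logged above
assert r <= 5e-3  # 'OK (relative difference <= 5E-3)'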
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.317313e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.321878e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321878e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.250158 sec + 2,118,002,606 cycles # 1.690 GHz + 3,556,013,914 instructions # 1.68 insn per cycle + 1.254237663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 59696ff16e..6feb1df6d4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,254 +10,219 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:45:30 +DATE: 2025-09-24_09:08:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.026958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.517845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.276791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277504e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.478506 sec -INFO: No Floating Point Exceptions have been reported - 1,992,355,788 cycles # 2.865 GHz - 3,027,729,409 instructions # 1.52 insn per cycle - 0.751914958 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.555785 sec + 2,284,015,691 cycles # 2.828 GHz + 3,386,185,673 instructions # 1.48 insn per cycle + 0.866642253 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.156008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.226322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.229025e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.900625 sec -INFO: No Floating Point Exceptions have been reported - 6,225,372,770 cycles # 2.919 GHz - 12,616,761,411 instructions # 2.03 insn per cycle - 2.188103626 seconds time elapsed +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.942577e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943527e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943527e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811669e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812527e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.812527e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.447888 sec -INFO: No Floating Point Exceptions have been reported - 24,912,816,300 cycles # 2.948 GHz - 79,110,249,403 instructions # 3.18 insn per cycle - 8.452014602 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.057855 sec + 26,043,974,250 cycles # 2.875 GHz + 81,082,670,791 instructions # 3.11 insn per cycle + 9.061675603 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.980733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993141e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993141e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354354 sec 
-INFO: No Floating Point Exceptions have been reported - 6,535,460,807 cycles # 2.772 GHz - 20,270,869,690 instructions # 3.10 insn per cycle - 2.358646539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.616188e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.628081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.628081e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.483899 sec + 6,798,798,697 cycles # 2.734 GHz + 21,064,458,791 instructions # 3.10 insn per cycle + 2.487889319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.603543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610156e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.444931e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450581e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450581e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027888 sec -INFO: No Floating Point Exceptions have been reported - 2,837,672,612 cycles # 2.752 GHz - 7,066,358,168 instructions # 2.49 insn per cycle - 1.031930682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.140030 sec + 3,048,561,490 cycles # 2.667 GHz + 7,493,968,699 instructions # 2.46 insn per cycle + 1.144111649 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.798975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807399e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.807399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.598657e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.605554e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.605554e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.916670 sec -INFO: No Floating Point Exceptions have been reported - 2,525,901,356 cycles # 2.745 GHz - 6,403,453,175 instructions # 2.54 insn per cycle - 0.920789172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.030819 sec + 2,744,491,104 cycles # 2.654 GHz + 6,930,618,312 instructions # 2.53 insn per cycle + 1.035010575 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.406582e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.411589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321285e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325896e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325896e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.171278 sec -INFO: No Floating Point Exceptions have been reported - 2,071,908,739 cycles # 1.764 GHz - 3,303,987,486 instructions # 1.59 insn per cycle - 1.175442581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.246497 sec + 2,116,511,021 cycles # 1.694 GHz + 3,556,085,472 instructions # 1.68 insn per cycle + 1.250375518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index fc006f8d57..d1663b8a42 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 
for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:05:24 +DATE: 2025-09-24_07:56:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.473150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278856e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.279619e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500476 sec -INFO: No Floating Point Exceptions have been reported - 2,066,687,911 cycles # 2.859 GHz - 3,064,980,702 instructions # 1.48 insn per cycle - 0.941605450 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.570407 sec + 2,331,140,164 cycles # 2.830 GHz + 3,426,106,320 instructions # 1.47 insn per cycle + 0.880168125 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.159101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.161763e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.803372 sec -INFO: No Floating Point Exceptions have been reported - 5,931,019,959 cycles # 2.909 GHz - 12,491,679,666 instructions # 2.11 insn per cycle - 2.096189929 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.927739e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820817e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821684e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821684e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.512686 sec -INFO: No Floating Point Exceptions have been reported - 24,976,995,918 cycles # 2.933 GHz - 78,849,322,260 instructions # 3.16 insn per cycle - 8.521021644 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.012115 sec + 25,906,896,614 cycles # 2.874 GHz + 81,026,549,335 instructions # 3.13 insn per cycle + 9.016202048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5064) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866250177339E-004 -Relative difference = 5.65798569465384e-08 +Avg ME (F77/C++) = 6.6274865557083146E-004 +Relative difference = 6.703773024224362e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.196617e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.210064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.210064e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.283841 sec -INFO: No Floating Point Exceptions have been reported - 6,462,353,077 cycles # 2.825 GHz - 20,230,287,596 instructions # 3.13 insn per cycle - 2.291660153 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13491) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.595850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607195e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.491441 sec + 6,807,327,813 cycles # 2.729 GHz + 21,056,857,927 instructions # 3.09 insn per cycle + 2.495580061 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21429) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861448331612E-004 -Relative difference = 2.1853408865157068e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.507603e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.441697e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447219e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447219e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.094262 sec -INFO: No Floating Point Exceptions have been reported - 2,977,852,840 cycles # 2.716 GHz - 7,207,139,157 instructions # 2.42 insn per cycle - 1.100869463 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12437) (512y: 0) (512z: 0) +TOTAL : 1.142349 sec + 3,055,307,583 cycles # 2.667 GHz + 7,491,997,935 instructions # 2.45 insn per cycle + 1.146425245 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15384) (512y: 0) (512z: 0) 
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668088170E-004
-Relative difference = 5.008331292535666e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.740158e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.747960e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.747960e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.613124e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.619942e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.619942e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.947565 sec
-INFO: No Floating Point Exceptions have been reported
- 2,615,044,427 cycles # 2.750 GHz
- 6,545,142,442 instructions # 2.50 insn per cycle
- 0.954571468 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11449) (512y: 27) (512z: 0)
+TOTAL : 1.021712 sec
+ 2,737,522,460 cycles # 2.670 GHz
+ 6,928,494,015 instructions # 2.53 insn per cycle
+ 1.025892768 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15110) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668088170E-004
-Relative difference = 5.008331292535666e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.344321e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.349023e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.349023e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.322207e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.326725e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.326725e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.225060 sec
-INFO: No Floating Point Exceptions have been reported
- 2,140,395,059 cycles # 1.742 GHz
- 3,462,158,546 instructions # 1.62 insn per cycle
- 1.232075146 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3037) (512y: 25) (512z: 9677)
+TOTAL : 1.245540 sec
+ 2,116,694,696 cycles # 1.695 GHz
+ 3,553,801,233 instructions # 1.68 insn per cycle
+ 1.249553015 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 5) (512z:14569)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952032316561E-004
-Relative difference = 3.066631594207157e-08
+Avg ME (F77/C++) = 6.6271953539095291E-004
+Relative difference = 5.340261281526277e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 507fa267fb..1756d90979 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:32:18
+DATE: 2025-09-24_08:49:50
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.570913e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.612300e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.616113e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.500062 sec
-INFO: No Floating Point Exceptions have been reported
- 2,077,093,809 cycles # 2.883 GHz
- 3,095,482,027 instructions # 1.49 insn per cycle
- 0.782648151 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.273145e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.279956e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.280681e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.571530 sec
+ 2,325,341,957 cycles # 2.824 GHz
+ 3,397,614,092 instructions # 1.46 insn per cycle
+ 0.880946906 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.624378e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.693284e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.696098e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.736663 sec
-INFO: No Floating Point Exceptions have been reported
- 5,745,039,966 cycles # 2.917 GHz
- 12,243,347,327 instructions # 2.13 insn per cycle
- 2.029186282 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262669162351490E-004
-Relative difference = 2.8232862531213374e-05
+Avg ME (F77/GPU) = 6.6262666367365719E-004
+Relative difference = 2.827504173853307e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 5.610943e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.611718e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.611718e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865260e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.866153e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.866153e+03 ) sec^-1
MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.233986 sec
-INFO: No Floating Point Exceptions have been reported
- 86,131,386,822 cycles # 2.946 GHz
- 135,652,659,903 instructions # 1.57 insn per cycle
- 29.237672033 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15856) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.797861 sec
+ 25,249,208,808 cycles # 2.869 GHz
+ 74,028,397,772 instructions # 2.93 insn per cycle
+ 8.801741537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13828) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275349717465765E-004
-Relative difference = 4.26303654465793e-09
+Avg ME (F77/C++) = 6.6275353240358159E-004
+Relative difference = 4.8892358250989e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.849906e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.862163e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.862163e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.399244 sec
-INFO: No Floating Point Exceptions have been reported
- 6,757,771,203 cycles # 2.813 GHz
- 19,352,943,673 instructions # 2.86 insn per cycle
- 2.403059869 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69577) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.122488e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.136140e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.136140e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.307551 sec
+ 6,280,992,865 cycles # 2.718 GHz
+ 19,272,778,154 instructions # 3.07 insn per cycle
+ 2.311503285 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:67926) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862748188362E-004
-Relative difference = 4.14665283800746e-08
+Avg ME (F77/C++) = 6.6274861529819207E-004
+Relative difference = 2.308294891171356e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.430057e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.435326e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.435326e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.151867 sec
-INFO: No Floating Point Exceptions have been reported
- 3,169,480,733 cycles # 2.744 GHz
- 6,794,963,559 instructions # 2.14 insn per cycle
- 1.155607574 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49034) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.512821e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.519072e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.519072e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060955e+00 +- 2.367411e+00 ) GeV^-4
+TOTAL : 1.089282 sec
+ 2,902,145,708 cycles # 2.657 GHz
+ 6,574,950,454 instructions # 2.27 insn per cycle
+ 1.093366801 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45256) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731568543797E-004
-Relative difference = 2.3668012430631962e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627271e-04
+Avg ME (F77/C++) = 6.6272713594379973E-004
+Relative difference = 5.423620023149683e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.731154e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.739005e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.739005e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 0.952402 sec
-INFO: No Floating Point Exceptions have been reported
- 2,622,407,179 cycles # 2.744 GHz
- 5,970,044,618 instructions # 2.28 insn per cycle
- 0.956238068 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42602) (512y: 11) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.730871e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.738738e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.738738e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060955e+00 +- 2.367411e+00 ) GeV^-4
+TOTAL : 0.952726 sec
+ 2,511,239,490 cycles # 2.626 GHz
+ 5,909,367,922 instructions # 2.35 insn per cycle
+ 0.956999585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40460) (512y: 5) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731568543797E-004
-Relative difference = 2.3668012430631962e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627271e-04
+Avg ME (F77/C++) = 6.6272713594379973E-004
+Relative difference = 5.423620023149683e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.414435e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.419474e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.419474e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.165045 sec
-INFO: No Floating Point Exceptions have been reported
- 2,067,228,248 cycles # 1.769 GHz
- 3,495,098,954 instructions # 1.69 insn per cycle
- 1.168981438 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5208) (512y: 3) (512z:44858)
+EvtsPerSec[Rmb+ME] (23) = ( 1.413637e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.418865e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.418865e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060957e+00 +- 2.367412e+00 ) GeV^-4
+TOTAL : 1.165430 sec
+ 1,976,421,523 cycles # 1.691 GHz
+ 3,061,294,918 instructions # 1.55 insn per cycle
+ 1.169355115 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2273) (512y: 8) (512z:39431)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627275e-04
-Avg ME (F77/C++) = 6.6272750237027223E-004
-Relative difference = 3.5765412974815996e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627273e-04
+Avg ME (F77/C++) = 6.6272731685135949E-004
+Relative difference = 2.5427290380992266e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 2595c32afa..68c286c55a 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:33:09
+DATE: 2025-09-24_08:50:37
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.573938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.613715e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.617455e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.493227 sec
-INFO: No Floating Point Exceptions have been reported
- 2,049,677,908 cycles # 2.879 GHz
- 3,032,655,926 instructions # 1.48 insn per cycle
- 0.769218706 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.266253e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.272891e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.273577e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.573598 sec
+ 2,364,434,436 cycles # 2.823 GHz
+ 3,420,738,261 instructions # 1.45 insn per cycle
+ 0.896007812 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.673337e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.742674e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.745488e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.731870 sec
-INFO: No Floating Point Exceptions have been reported
- 5,773,880,906 cycles # 2.919 GHz
- 12,286,627,464 instructions # 2.13 insn per cycle
- 2.034768323 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262669162351490E-004
-Relative difference = 2.8232862531213374e-05
+Avg ME (F77/GPU) = 6.6262666367365719E-004
+Relative difference = 2.827504173853307e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 5.600277e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.601076e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.601076e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.289301 sec
-INFO: No Floating Point Exceptions have been reported
- 86,207,606,672 cycles # 2.943 GHz
- 135,355,986,373 instructions # 1.57 insn per cycle
- 29.293063672 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15471) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.866837e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.867760e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.867760e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059969e+00 +- 2.367800e+00 ) GeV^-4
+TOTAL : 8.790005 sec
+ 25,260,397,225 cycles # 2.873 GHz
+ 73,871,409,822 instructions # 2.92 insn per cycle
+ 8.793865449 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13598) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275349662128086E-004
-Relative difference = 5.098002770919431e-09
+Avg ME (F77/C++) = 6.6275351788769310E-004
+Relative difference = 2.698996393512682e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.848001e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.860244e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.860244e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.399823 sec
-INFO: No Floating Point Exceptions have been reported
- 6,855,955,670 cycles # 2.853 GHz
- 19,471,788,292 instructions # 2.84 insn per cycle
- 2.403723205 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69876) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.194093e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.207887e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.207887e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.284649 sec
+ 6,280,560,769 cycles # 2.746 GHz
+ 19,274,584,783 instructions # 3.07 insn per cycle
+ 2.288743756 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:67594) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862799683282E-004
-Relative difference = 4.2243518621014775e-08
+Avg ME (F77/C++) = 6.6274857166543494E-004
+Relative difference = 4.275311189565278e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.455129e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.460639e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.460639e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.132031 sec
-INFO: No Floating Point Exceptions have been reported
- 3,102,391,764 cycles # 2.733 GHz
- 6,715,014,781 instructions # 2.16 insn per cycle
- 1.135898458 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47692) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.508483e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.514409e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.514409e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060954e+00 +- 2.367410e+00 ) GeV^-4
+TOTAL : 1.092368 sec
+ 2,897,281,270 cycles # 2.643 GHz
+ 6,533,267,590 instructions # 2.25 insn per cycle
+ 1.098506205 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:44412) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731623419345E-004
-Relative difference = 2.449603850635964e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272715143205624E-004
+Relative difference = 7.328497113229569e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.738588e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.746518e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.746518e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 0.948137 sec
-INFO: No Floating Point Exceptions have been reported
- 2,626,199,962 cycles # 2.761 GHz
- 5,966,019,567 instructions # 2.27 insn per cycle
- 0.951931849 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41858) (512y: 13) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.760309e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.768467e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.768467e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060954e+00 +- 2.367410e+00 ) GeV^-4
+TOTAL : 0.936509 sec
+ 2,500,675,539 cycles # 2.661 GHz
+ 5,890,044,790 instructions # 2.36 insn per cycle
5,890,044,790 instructions # 2.36 insn per cycle + 0.940573254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39822) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731623419345E-004 -Relative difference = 2.449603850635964e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272715143205624E-004 +Relative difference = 7.328497113229569e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419616e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419616e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.164736 sec -INFO: No Floating Point Exceptions have been reported - 2,067,746,434 cycles # 1.771 GHz - 3,487,891,958 instructions # 1.69 insn per cycle - 1.168545250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4171) (512y: 4) (512z:44494) +EvtsPerSec[Rmb+ME] (23) = ( 1.418341e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423496e+04 ) sec^-1 +MeanMatrixElemValue = ( 
4.060957e+00 +- 2.367412e+00 ) GeV^-4 +TOTAL : 1.161440 sec + 1,967,033,501 cycles # 1.689 GHz + 3,051,207,349 instructions # 1.55 insn per cycle + 1.165567150 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 8) (512z:39086) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750247886592E-004 -Relative difference = 3.740400032174438e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272730894095799E-004 +Relative difference = 1.3491156920820374e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..f18d4f805a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:19:42 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.822375e+03 1 256 +9.453850e+03 2 256 +1.857855e+04 4 256 +3.676283e+04 8 256 +7.275826e+04 16 256 +1.153304e+05 32 256 +1.042562e+05 64 256 +1.034053e+05 128 256 +1.073056e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.199679e+02 1 32 +1.236165e+03 2 32 +2.466449e+03 4 32 +4.895667e+03 8 32 +9.549456e+03 16 32 +1.878846e+04 32 32 +3.717465e+04 64 32 +7.269473e+04 128 32 +1.164270e+05 256 32 +1.038789e+05 512 32 +1.011077e+05 1024 32 +1.035741e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
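The GPU ".scaling" blocks above tabulate throughput against the launch grid, one "<events/sec> <nblocks> <nthreads>" triple per line. As a reading aid, here is a minimal Python sketch of how such a block can be parsed to locate the peak throughput; the triple format is taken from the log above, while the function name and the embedded sample are illustrative only.

# Illustrative only: parse "<throughput> <nblocks> <nthreads>" triples as they
# appear in the .scaling logs above and report the fastest grid observed.
# The sample lines are copied from the "### GPU: scaling test 256" block.
SAMPLE = """\
4.822375e+03 1 256
9.453850e+03 2 256
1.857855e+04 4 256
3.676283e+04 8 256
7.275826e+04 16 256
1.153304e+05 32 256
1.042562e+05 64 256
1.034053e+05 128 256
1.073056e+05 256 256
"""

def peak_throughput(text):
    """Return (throughput, nblocks, nthreads) for the fastest grid."""
    best = None
    for line in text.splitlines():
        fields = line.split()
        if len(fields) != 3:
            continue  # skip headers, separators and assertion messages
        cand = (float(fields[0]), int(fields[1]), int(fields[2]))
        if best is None or cand[0] > best[0]:
            best = cand
    return best

tput, nb, nt = peak_throughput(SAMPLE)
print(f"peak {tput:.3e} ev/s at grid {nb} x {nt}")  # 1.153e+05 ev/s at 32 x 256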
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.757500e+03 1 256 +1.764699e+03 2 256 +1.759218e+03 4 256 +### CPU: scaling test 32 +1.684513e+03 1 32 +1.737222e+03 2 32 +1.742163e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.397695e+03 1 256 +3.406207e+03 2 256 +3.387011e+03 4 256 +### CPU: scaling test 32 +3.380204e+03 1 32 +3.388767e+03 2 32 +3.400425e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.323452e+03 1 256 +7.259704e+03 2 256 +7.409045e+03 4 256 +### CPU: scaling test 32 +6.823037e+03 1 32 +7.085153e+03 2 32 +7.085134e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.310815e+03 1 256 +8.162898e+03 2 256 +8.171326e+03 4 256 +### CPU: scaling test 32 +7.804838e+03 1 32 +8.373489e+03 2 32 +8.009578e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.602547e+03 1 256 +6.595105e+03 2 256 +6.634294e+03 4 256 +### CPU: scaling test 32 +6.535754e+03 1 32 +6.615078e+03 2 32 +6.578849e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a3a2deda6e..cb75ca85a9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:48 +DATE: 2025-09-24_07:53:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.318725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347238e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349358e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045875e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539210 sec -INFO: No Floating Point Exceptions have been reported - 2,220,963,802 cycles # 2.880 GHz - 3,406,426,816 instructions # 1.53 insn per cycle - 0.832307462 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.657183 sec + 2,703,266,341 cycles # 2.831 GHz + 4,321,160,228 instructions # 1.60 insn per cycle + 1.011294033 seconds time elapsed ......................................................................... 
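Every cmpExe block in these logs closes by cross-checking the average matrix element between the C++/CUDA and Fortran paths against a 5E-3 tolerance. A minimal sketch of that check follows, using the C++/GPU and F77/GPU values printed a few lines below; the normalisation |a - b| / |a| is an assumption (the real cmpExe script is not part of this diff), but it does reproduce the printed relative difference.

# Illustrative only: the tolerance check reported as
#   "OK (relative difference <= 5E-3)"
# in the cmpExe blocks of this log. The two Avg ME values are copied from the
# C++/GPU vs F77/GPU comparison below; |a - b| / |a| is an assumed
# normalisation that reproduces the printed relative difference.
TOLERANCE = 5e-3  # the threshold quoted in the logs

def rel_diff(a, b):
    return abs(a - b) / abs(a)

avg_me_cpp = 6.626675e-04            # Avg ME (C++/GPU)
avg_me_f77 = 6.6266730799887004e-04  # Avg ME (F77/GPU)

rd = rel_diff(avg_me_cpp, avg_me_f77)
print(f"Relative difference = {rd}")         # ~2.8974e-07, as printed below
print("OK" if rd <= TOLERANCE else "ERROR")  # OK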
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165985e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.047029 sec -INFO: No Floating Point Exceptions have been reported - 9,687,290,131 cycles # 2.924 GHz - 21,862,744,253 instructions # 2.26 insn per cycle - 3.379254641 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.868179e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869079e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869079e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.757037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.757859e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.757859e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.786228 sec -INFO: No Floating Point Exceptions have been reported - 25,910,148,307 cycles # 2.949 GHz - 79,427,985,275 instructions # 3.07 insn per cycle - 8.790193498 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4775) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.340689 sec + 26,866,907,464 cycles # 2.876 GHz + 81,209,946,669 instructions # 3.02 insn per cycle + 9.344779590 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.521065e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524381e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524381e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.414499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417525e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.666859 sec -INFO: No Floating Point Exceptions have been reported - 12,831,991,791 cycles # 2.749 GHz - 38,825,085,312 instructions # 3.03 insn per cycle - 4.671138327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13173) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.809619 sec + 13,803,444,492 cycles # 2.868 GHz + 40,349,187,189 instructions # 2.92 insn per cycle + 4.813848928 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087173e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.104021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.393676e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.407811e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.407811e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035173 sec -INFO: No Floating Point Exceptions have been reported - 5,594,158,972 cycles # 2.744 GHz - 13,617,938,147 instructions # 2.43 insn per cycle - 2.039272194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11427) (512y: 0) (512z: 0) +TOTAL : 2.224565 sec + 5,935,126,335 cycles # 2.664 GHz + 14,390,194,787 instructions # 2.42 insn per cycle + 2.228759492 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.329915e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.351715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.270248e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.299986e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.299986e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.764965 sec -INFO: No Floating Point Exceptions have been reported - 4,865,961,098 cycles # 2.752 GHz - 12,296,280,016 instructions # 2.53 insn per cycle - 1.768959352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10331) (512y: 80) (512z: 0) +TOTAL : 1.993562 sec + 5,247,295,678 cycles # 2.641 GHz + 13,270,608,923 instructions # 2.53 insn per cycle + 1.998898042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.944494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.956947e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.956947e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.626375e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.637949e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.637949e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.368908 sec -INFO: No Floating Point Exceptions have been reported - 4,175,656,001 cycles # 1.761 GHz - 6,394,856,033 instructions # 1.53 insn per cycle - 2.373043514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1983) (512y: 92) (512z: 9360) +TOTAL : 2.481586 sec + 4,197,558,675 cycles # 1.689 GHz + 6,762,650,295 instructions # 1.61 insn per cycle + 2.485840885 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..1c869da30d --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:39:30 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.331545e+03 1 256 +8.426586e+03 2 256 +1.654696e+04 4 256 +3.201027e+04 8 256 +6.406175e+04 16 256 +1.019414e+05 32 256 +9.772810e+04 64 256 +9.873787e+04 128 256 +1.039265e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.589995e+02 1 32 +1.112129e+03 2 32 +2.206817e+03 4 32 +4.351940e+03 8 32 +8.488479e+03 16 32 +1.664990e+04 32 32 +3.291208e+04 64 32 +6.376795e+04 128 32 +1.034208e+05 256 32 +9.712943e+04 512 32 +9.650168e+04 1024 32 +9.998255e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.766381e+03 1 256 +1.762261e+03 2 256 +1.766249e+03 4 256 +### CPU: scaling test 32 +1.745638e+03 1 32 +1.768353e+03 2 32 +1.766116e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.362029e+03 1 256 +3.385990e+03 2 256 +3.394280e+03 4 256 +### CPU: scaling test 32 +3.309168e+03 1 32 +3.404285e+03 2 32 +3.428103e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.836698e+03 1 256 +7.294631e+03 2 256 +7.375209e+03 4 256 +### CPU: scaling test 32 +7.320256e+03 1 32 +7.135601e+03 2 32 +7.115901e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.133219e+03 1 256 +8.229915e+03 2 256 +8.293007e+03 4 256 +### CPU: scaling test 32 +7.769034e+03 1 32 +8.276380e+03 2 32 +8.151187e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.663933e+03 1 256 +6.694036e+03 2 256 +6.647690e+03 4 256 +### CPU: scaling test 32 +6.567727e+03 1 32 +6.528060e+03 2 32 +6.592152e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..821cf30e48 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:34:20 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.776170e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.799139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.800905e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.375389 sec + 4,991,020,745 cycles # 2.835 GHz + 7,054,085,160 instructions # 1.41 insn per cycle + 1.816386659 seconds time elapsed +......................................................................... 
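With CUDACPP_RUNTIME_BLASCOLORSUM=1 the CUDA run above reports about 9.8e+04 ev/s, versus about 1.05e+05 ev/s in the default log earlier in this diff. A sketch of how the two committed files could be scraped to quantify that difference: the paths match the files touched by this diff (relative to epochX/cudacpp/tput), while the regex and the choice of the first match (the CUDA run comes first in each log) are assumptions.

# Illustrative only: compare the "EvtsPerSec[MECalcOnly]" throughput between
# the default and the CUDACPP_RUNTIME_BLASCOLORSUM=1 ("blasOn") runs by
# scraping the two committed log files; run from epochX/cudacpp/tput.
import re

PATTERN = re.compile(r"EvtsPerSec\[MECalcOnly\]\s*\(3a\)\s*=\s*\(\s*([0-9.e+-]+)\s*\)")

def first_me_throughput(path):
    """First MECalcOnly figure in the log (the CUDA run, in these files)."""
    with open(path) as f:
        return float(PATTERN.search(f.read()).group(1))

base = first_me_throughput("logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt")
blas = first_me_throughput("logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt")
# On the numbers shown in this diff: 9.800905e+04 / 1.045875e+05, ratio ~0.94.
print(f"default {base:.3e} ev/s, blasOn {blas:.3e} ev/s, ratio {blas/base:.3f}")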
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266733778757203E-004 +Relative difference = 2.447870582934832e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.752679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.753471e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.753471e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.363928 sec + 26,884,271,281 cycles # 2.870 GHz + 81,216,096,709 instructions # 3.02 insn per cycle + 9.368036247 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.414881e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417986e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417986e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.808799 sec + 13,800,804,174 cycles # 2.868 GHz + 40,351,723,491 instructions # 2.92 insn per cycle + 4.812876561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.409620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.424305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.424305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.219807 sec + 5,939,784,204 cycles # 2.672 GHz + 14,390,069,622 instructions # 2.42 insn per cycle + 2.223804960 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.210020e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.227626e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.227626e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.003983 sec + 5,246,682,786 cycles # 2.615 GHz + 13,269,929,931 instructions # 2.53 insn per cycle + 2.008017898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.600241e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612011e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612011e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.491377 sec + 4,210,144,393 cycles # 1.688 GHz + 6,762,278,416 instructions # 1.61 insn per cycle + 2.495546734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..9c6d8ca930 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_09:22:53 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.039625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.042252e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.660363 sec + 2,619,828,821 cycles # 2.834 GHz + 4,281,538,007 instructions # 1.63 insn per cycle + 0.986350069 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.756398e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.757230e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.757230e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.343945 sec + 26,858,148,803 cycles # 2.874 GHz + 81,209,073,817 instructions # 3.02 insn per cycle + 9.347982497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.417609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420711e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420711e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.805076 sec + 13,803,294,859 cycles # 2.871 GHz + 40,349,153,644 instructions # 2.92 insn per cycle + 4.809203637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.288092e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.302009e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.302009e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.256619 sec + 5,956,235,867 cycles # 2.636 GHz + 14,389,746,485 instructions # 2.42 insn per cycle + 2.260701525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.402558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.421025e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.421025e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.958309 sec + 5,241,735,696 cycles # 2.672 GHz + 13,269,342,480 instructions # 2.53 insn per cycle + 1.962606340 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.639185e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.650544e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.650544e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.476926 sec + 4,196,346,010 cycles # 1.692 GHz + 6,762,580,177 instructions # 1.61 insn per cycle + 2.481118279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index f598011718..1a772ed057 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:22 +DATE: 2025-09-24_07:54:24 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.335025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.040067e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043284e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537353 sec -INFO: No Floating Point Exceptions have been reported - 2,216,980,042 cycles # 2.869 GHz - 3,463,326,813 instructions # 1.56 insn per cycle - 0.836472238 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.659649 sec + 2,692,289,857 cycles # 2.834 GHz + 4,308,067,581 instructions # 1.60 insn per cycle + 1.010161880 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.173253e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034442 sec -INFO: No Floating Point Exceptions have been reported - 9,665,974,027 cycles # 2.922 GHz - 21,248,987,108 instructions # 2.20 insn per cycle - 3.363171619 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862251e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.863154e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.863154e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.754147e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.754970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.754970e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.813876 sec -INFO: No Floating Point Exceptions have been reported - 25,987,730,158 cycles # 2.948 GHz - 79,453,128,863 instructions # 3.06 insn per cycle - 8.817767368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.355875 sec + 26,882,377,088 cycles # 2.872 GHz + 81,144,358,037 instructions # 3.02 insn per cycle + 9.359891416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6108) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.512571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515785e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515785e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.398478e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.401707e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.401707e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.675994 sec -INFO: No Floating Point Exceptions have been reported - 12,822,983,844 cycles # 2.741 GHz - 38,780,874,555 instructions # 3.02 insn per cycle - 4.681038643 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.832125 sec + 13,873,927,472 cycles # 2.870 GHz + 40,345,879,506 instructions # 2.91 insn per cycle + 4.836479951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20851) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.056370e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.072927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.072927e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.370397e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.384723e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.384723e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.042531 sec -INFO: No Floating Point Exceptions have been reported - 5,590,175,615 cycles # 2.733 GHz - 13,732,675,080 instructions # 2.46 insn per cycle - 2.046647326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11510) (512y: 0) (512z: 0) +TOTAL : 2.231393 sec + 5,956,225,034 cycles # 2.665 GHz + 14,386,792,933 instructions # 2.42 insn per cycle + 2.235577955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14889) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.148791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.170046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.170046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.240910e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.259069e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259069e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.800883 sec -INFO: No Floating Point Exceptions have been reported - 4,955,825,709 cycles # 2.749 GHz - 12,423,990,964 instructions # 2.51 insn per cycle - 1.804980058 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10322) (512y: 240) (512z: 0) +TOTAL : 1.996485 sec + 5,327,672,851 cycles # 2.665 GHz + 13,266,285,040 instructions # 2.49 insn per cycle + 2.000686077 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14571) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.851374e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.863307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.863307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.622398e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.633506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.633506e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.400794 sec -INFO: No Floating Point Exceptions have been reported - 4,218,682,996 cycles # 1.755 GHz - 6,496,899,309 instructions # 1.54 insn per cycle - 2.406253121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1806) (512y: 190) (512z: 9358) +TOTAL : 2.483048 sec + 4,190,553,955 cycles # 1.686 GHz + 6,758,161,428 instructions # 1.61 insn per cycle + 2.487181679 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 61) (512z:14567) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..601539335a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-09-24_08:21:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.619045e+02 1 256 +5.125049e+02 2 256 +9.895442e+02 4 256 +1.869472e+03 8 256 +3.332118e+03 16 256 +4.504461e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+### GPU: scaling test 32 +3.378133e+01 1 32 +6.741007e+01 2 32 +1.342433e+02 4 32 +2.647447e+02 8 32 +5.193102e+02 16 32 +1.007228e+03 32 32 +1.897741e+03 64 32 +3.318151e+03 128 32 +4.525995e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.167657e+01 1 256 +5.126698e+01 2 256 +5.197015e+01 4 256 +### CPU: scaling test 32 +5.039014e+01 1 32 +5.180681e+01 2 32 +5.118270e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.706331e+01 1 256 +9.963066e+01 2 256 +9.765425e+01 4 256 +### CPU: scaling test 32 +9.726834e+01 1 32 +9.960643e+01 2 32 +9.770466e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.101095e+02 1 256 +2.150546e+02 2 256 +2.136656e+02 4 256 +### CPU: scaling test 32 +2.098436e+02 1 32 +2.136973e+02 2 32 +2.141888e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.319430e+02 1 256 +2.339110e+02 2 256 +2.313243e+02 4 256 +### CPU: scaling test 32 +2.306710e+02 1 32 +2.300977e+02 2 32 +2.283924e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.509438e+02 1 256 +2.527744e+02 2 256 +2.493814e+02 4 256 +### CPU: scaling test 32 +2.520183e+02 1 32 +2.508423e+02 2 32 +2.486287e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 17692fc5fb..53a1d8e8ed 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was 
cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:07:10 +DATE: 2025-09-24_07:59:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.059934e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060148e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.622335e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.622504e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.622543e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.453264 sec -INFO: No Floating Point Exceptions have been reported - 8,089,923,192 cycles # 2.904 GHz - 15,932,007,883 instructions # 1.97 insn per cycle - 2.843483231 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.969322 sec + 12,105,688,469 cycles # 2.856 GHz + 20,287,798,096 instructions # 1.68 insn per cycle + 4.295173144 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.246459e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.248360e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.248591e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019480 sec -INFO: No Floating Point Exceptions have been reported - 12,563,980,059 cycles # 2.886 GHz - 29,860,686,581 instructions # 2.38 insn per cycle - 4.410635015 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 114 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.535286e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.535490e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.535490e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.045437e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.045534e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.045534e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 7.007645 sec
-INFO: No Floating Point Exceptions have been reported
- 18,987,096,753 cycles # 2.709 GHz
- 53,904,905,030 instructions # 2.84 insn per cycle
- 7.011475835 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.447866 sec
+ 27,795,495,084 cycles # 2.660 GHz
+ 55,256,291,469 instructions # 1.99 insn per cycle
+ 10.452114933 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.576045e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.576133e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.576133e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.621858e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.622184e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.622184e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.352060 sec
-INFO: No Floating Point Exceptions have been reported
- 9,813,557,960 cycles # 2.925 GHz
- 27,153,109,398 instructions # 2.77 insn per cycle
- 3.355902855 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.489538 sec
+ 15,617,128,762 cycles # 2.844 GHz
+ 29,177,592,109 instructions # 1.87 insn per cycle
+ 5.493747260 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222579) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.392533e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.392946e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.392946e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.131961e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.132126e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.132126e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.558312 sec
-INFO: No Floating Point Exceptions have been reported
- 4,259,121,658 cycles # 2.728 GHz
- 9,591,809,021 instructions # 2.25 insn per cycle
- 1.562248696 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0)
+TOTAL : 2.480198 sec
+ 6,569,472,532 cycles # 2.645 GHz
+ 10,539,989,164 instructions # 1.60 insn per cycle
+ 2.484515280 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162491) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.852746e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.853256e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.853256e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.299322e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.299676e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.299676e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.371089 sec
-INFO: No Floating Point Exceptions have been reported
- 3,728,351,942 cycles # 2.713 GHz
- 8,515,110,933 instructions # 2.28 insn per cycle
- 1.374961080 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0)
+TOTAL : 2.298964 sec
+ 6,068,566,192 cycles # 2.636 GHz
+ 9,593,892,046 instructions # 1.58 insn per cycle
+ 2.303008730 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162127) (512y: 45) (512z: 0)
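(Aside: the "OK" verdicts in these logs are a plain relative-difference test between the Fortran and C++/CUDA average matrix elements, against the 5E-3 tolerance quoted in the log line itself. A minimal Python sketch of that check, reusing the avx2 figures above; the exact normalisation used by the cmpExe helper is an assumption here:)

    # Minimal sketch: only the two averages and the tolerance are taken from the log.
    me_cpp = 9.872263e-03            # Avg ME (C++/C++)
    me_f77 = 9.8722595285411514e-03  # Avg ME (F77/C++)
    rel_diff = abs(me_f77 - me_cpp) / abs(me_cpp)  # assumed normalisation
    print(rel_diff)                  # ~3.5e-07, in line with the value reported above
    assert rel_diff <= 5e-3          # the "OK (relative difference <= 5E-3)" criterion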
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.432608e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.433087e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.433087e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.480717e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.481000e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.481000e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.541076 sec
-INFO: No Floating Point Exceptions have been reported
- 2,702,698,179 cycles # 1.750 GHz
- 4,282,306,811 instructions # 1.58 insn per cycle
- 1.545099546 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114)
+TOTAL : 2.131361 sec
+ 3,566,053,422 cycles # 1.670 GHz
+ 4,796,665,630 instructions # 1.35 insn per cycle
+ 2.135773673 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 61) (512z:174170)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1cf857b709..8dbb1e5f31 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make: Nothing to be done for 'all'.
@@ -21,242 +25,189 @@ make: Nothing to be done for 'all'.
make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:40:38
+DATE: 2025-09-24_08:59:57
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.054825e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.057209e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.057209e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.621086e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.622365e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.622365e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.388056 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 7,931,671,790 cycles # 2.924 GHz
- 17,623,602,431 instructions # 2.22 insn per cycle
- 2.770306640 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
+TOTAL : 3.951447 sec
+ 12,010,276,722 cycles # 2.850 GHz
+ 19,757,141,721 instructions # 1.65 insn per cycle
+ 4.269327289 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
-.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP=
+==PROF== Profiling "diagram1": launch__registers_per_thread 114
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.226146e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.260909e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.260909e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.992337 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 12,629,951,963 cycles # 2.926 GHz
- 29,269,734,483 instructions # 2.32 insn per cycle
- 4.375813430 seconds time elapsed
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.889828e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.890068e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.890068e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.188685e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.188792e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.188792e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.696425 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 18,936,809,312 cycles # 2.827 GHz
- 53,907,854,112 instructions # 2.85 insn per cycle
- 6.700731218 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.175422 sec
+ 26,841,477,132 cycles # 2.637 GHz
+ 55,257,041,315 instructions # 2.06 insn per cycle
+ 10.180196300 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.586455e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.586548e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.586548e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.844962e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.845309e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.845309e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.330534 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,805,857,457 cycles # 2.941 GHz
- 27,153,288,385 instructions # 2.77 insn per cycle
- 3.335034911 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.365862 sec
+ 15,274,413,237 cycles # 2.845 GHz
+ 29,178,711,293 instructions # 1.91 insn per cycle
+ 5.370266744 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222579) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.386158e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.386550e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.386550e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.094443e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.094602e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.094602e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.562759 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 4,284,138,212 cycles # 2.735 GHz
- 9,593,930,746 instructions # 2.24 insn per cycle
- 1.567182963 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0)
+TOTAL : 2.523843 sec
+ 6,646,308,975 cycles # 2.630 GHz
+ 10,541,757,592 instructions # 1.59 insn per cycle
+ 2.528288062 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162491) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.892770e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.893321e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.893321e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.271706e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.271895e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.271895e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.359134 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 3,729,884,424 cycles # 2.737 GHz
- 8,517,697,790 instructions # 2.28 insn per cycle
- 1.363667603 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0)
+TOTAL : 2.326974 sec
+ 6,158,234,114 cycles # 2.643 GHz
+ 9,593,932,881 instructions # 1.56 insn per cycle
+ 2.331315094 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162127) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.423206e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.423718e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.423718e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.463912e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.464191e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.464191e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.547281 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,698,269,777 cycles # 1.739 GHz
- 4,283,935,635 instructions # 1.59 insn per cycle
- 1.552053679 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114)
+TOTAL : 2.146822 sec
+ 3,574,784,477 cycles # 1.662 GHz
+ 4,798,875,146 instructions # 1.34 insn per cycle
+ 2.151446066 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 61) (512z:174170)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index bc67f5cacf..aa58e44f42 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make: Nothing to be done for 'all'.
@@ -21,218 +25,181 @@ make: Nothing to be done for 'all'.
make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:08:38
+DATE: 2025-09-24_08:02:03
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.058591e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.058974e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.059077e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.623198e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.623366e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.623412e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.451568 sec
-INFO: No Floating Point Exceptions have been reported
- 8,115,809,761 cycles # 2.919 GHz
- 18,292,352,744 instructions # 2.25 insn per cycle
- 2.835762935 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 3.971459 sec
+ 12,134,959,910 cycles # 2.856 GHz
+ 20,297,893,372 instructions # 1.67 insn per cycle
+ 4.304261275 seconds time elapsed
.........................................................................
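(Aside: the "==PROF==" lines are Nsight Compute console output; runNcu is presumably a harness wrapper around the ncu CLI. A hypothetical stand-alone invocation collecting the same two metrics quoted in these logs; the wrapper's actual flags are not shown here:)

    ncu --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct ./build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1

Note that where the 2024 logs profiled a single "sigmaKin" kernel, the 2025 logs profile separate "diagram1"/"diagram2" kernels plus a "color_sum_kernel": the per-diagram kernels report far fewer registers per thread (114-116 and 98) than the monolithic kernel did (255), while the colour sum still reports 255.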
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.228388e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.230439e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.230672e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.019291 sec
-INFO: No Floating Point Exceptions have been reported
- 12,725,284,497 cycles # 2.922 GHz
- 29,505,773,730 instructions # 2.32 insn per cycle
- 4.410068917 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 116
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.905987e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.906203e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.906203e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.235365e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.235462e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.235462e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.685741 sec
-INFO: No Floating Point Exceptions have been reported
- 18,901,791,742 cycles # 2.826 GHz
- 53,936,334,501 instructions # 2.85 insn per cycle
- 6.689520607 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32022) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.087603 sec
+ 26,870,624,294 cycles # 2.663 GHz
+ 55,196,790,786 instructions # 2.05 insn per cycle
+ 10.091749369 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82721) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.555988e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.556078e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.556078e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.605466e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.605795e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.605795e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.395185 sec
-INFO: No Floating Point Exceptions have been reported
- 9,954,308,036 cycles # 2.929 GHz
- 27,130,330,125 instructions # 2.73 insn per cycle
- 3.399134205 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96368) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.498721 sec
+ 15,622,067,797 cycles # 2.840 GHz
+ 29,174,932,066 instructions # 1.87 insn per cycle
+ 5.502837399 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222567) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.364235e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.364649e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.364649e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.082981e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.083174e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.083174e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.571658 sec
-INFO: No Floating Point Exceptions have been reported
- 4,284,967,782 cycles # 2.721 GHz
- 9,585,542,173 instructions # 2.24 insn per cycle
- 1.575575323 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84968) (512y: 0) (512z: 0)
+TOTAL : 2.537103 sec
+ 6,675,570,158 cycles # 2.628 GHz
+ 10,538,789,893 instructions # 1.58 insn per cycle
+ 2.541297219 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162326) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.898680e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.899276e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.899276e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.307808e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.308027e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.308027e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.358371 sec
-INFO: No Floating Point Exceptions have been reported
- 3,717,774,700 cycles # 2.731 GHz
- 8,507,853,536 instructions # 2.29 insn per cycle
- 1.362296235 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80632) (512y: 240) (512z: 0)
+TOTAL : 2.290450 sec
+ 6,052,806,247 cycles # 2.639 GHz
+ 9,590,780,662 instructions # 1.58 insn per cycle
+ 2.294513704 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:161957) (512y: 45) (512z: 0)
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285411514E-003 +Relative difference = 3.5163759796632844e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.399522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.400013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.400013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.481025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481302e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481302e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.555521 sec -INFO: No Floating Point Exceptions have been reported - 2,693,302,897 cycles # 1.729 GHz - 4,281,674,096 instructions # 1.59 insn per cycle - 1.559394081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2693) (512y: 184) (512z:79098) +TOTAL : 2.131058 sec + 3,545,703,154 cycles # 1.661 GHz + 4,795,627,357 instructions # 1.35 insn per cycle + 2.135278118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3051) (512y: 
61) (512z:174170) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285411514E-003 +Relative difference = 3.5163759796632844e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..eadffd584b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. 
+ +DATE: 2025-09-24_08:28:23 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.664820e+02 1 256 +5.275400e+02 2 256 +1.038071e+03 4 256 +2.026475e+03 8 256 +3.877564e+03 16 256 +6.962465e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +3.354429e+01 1 32 +6.708439e+01 2 32 +1.339141e+02 4 32 +2.671031e+02 8 32 +5.289090e+02 16 32 +1.039818e+03 32 32 +2.032513e+03 64 32 +3.851816e+03 128 32 +6.770722e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.791936e+01 1 256 +5.716628e+01 2 256 +5.697536e+01 4 256 +### CPU: scaling test 32 +5.805917e+01 1 32 +5.723672e+01 2 32 +5.732940e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.191686e+02 1 256 +2.173201e+02 2 256 +2.184448e+02 4 256 +### CPU: scaling test 32 +2.223714e+02 1 32 +2.214415e+02 2 32 +2.186868e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.337721e+02 1 256 +4.299182e+02 2 256 +4.290540e+02 4 256 +### CPU: scaling test 32 +4.317507e+02 1 32 +4.343360e+02 2 32 +4.248740e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.636499e+02 1 256 +4.680596e+02 2 256 +4.719526e+02 4 256 +### CPU: scaling test 32 +4.621078e+02 1 32 +4.605198e+02 2 32 +4.721265e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.018132e+02 1 256 +5.093203e+02 2 256 
+5.090828e+02 4 256 +### CPU: scaling test 32 +5.115033e+02 1 32 +5.033095e+02 2 32 +5.124353e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e477be7c61..03a696ae52 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:13:00 +DATE: 2025-09-24_08:10:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.207250e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.207995e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.208247e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.665002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.665156e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665198e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762040 sec -INFO: No Floating Point Exceptions have been reported - 5,937,636,063 cycles # 2.916 GHz - 12,374,083,331 instructions # 2.08 insn per cycle - 2.091996677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.987731 sec + 12,196,113,186 cycles # 2.857 GHz + 20,128,416,611 instructions # 1.65 insn per cycle + 4.324838531 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.149439e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150179e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.066345 sec -INFO: No Floating Point Exceptions have been reported - 6,803,203,568 cycles # 2.918 GHz - 14,656,096,283 instructions # 2.15 insn per cycle - 2.390130877 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.548424e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.548685e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.548685e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.179003 sec -INFO: No Floating Point Exceptions have been reported - 18,168,840,210 cycles # 2.939 GHz - 53,911,011,794 instructions # 2.97 insn per cycle - 6.183081263 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.736079e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.736198e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.736198e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.206588 sec + 26,269,096,419 cycles # 2.853 GHz + 55,380,240,679 instructions # 2.11 insn per cycle + 9.210704438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:64102) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612086552360E-003 +Relative difference = 2.118765858747532e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396067e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396067e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.556967 sec -INFO: No Floating Point Exceptions have been reported - 4,597,936,627 cycles # 2.947 GHz - 13,808,300,252 instructions # 3.00 insn per cycle - 1.560798930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.147645e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.147829e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147829e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.461099 sec + 7,001,514,428 cycles # 2.841 GHz + 14,820,625,172 instructions # 2.12 insn per cycle + 2.465392683 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4:223175) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.833708e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.835461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.835461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.292815e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.293506e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.293506e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.774770 sec -INFO: No Floating Point Exceptions have been reported - 2,127,367,774 cycles # 2.734 GHz - 4,836,875,487 instructions # 2.27 insn per cycle - 0.778636721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 1.232942 sec + 3,262,019,930 cycles # 2.638 GHz + 5,311,976,962 instructions # 1.63 insn per cycle + 1.236967483 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:163023) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.729108e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731291e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731291e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.647089e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.648057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.648057e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.685221 sec -INFO: No Floating Point Exceptions have been reported - 1,884,703,570 cycles # 2.737 GHz - 4,291,263,737 instructions # 2.28 insn per cycle - 0.689203509 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 1.139064 sec + 3,024,353,048 cycles # 2.648 GHz + 4,830,348,422 instructions # 1.60 insn per cycle + 1.143303421 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.870048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.872187e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.872187e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.771101 sec -INFO: No Floating Point Exceptions have been reported - 1,354,646,750 cycles # 1.748 GHz - 2,162,779,823 instructions # 1.60 insn per cycle - 0.775438585 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +EvtsPerSec[Rmb+ME] (23) = ( 4.940916e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.942061e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.942061e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.072015 sec + 1,787,397,501 cycles 
# 1.662 GHz + 2,421,351,557 instructions # 1.35 insn per cycle + 1.076381120 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3752) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 09d523a948..e897ea9b26 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,242 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:42:06 +DATE: 2025-09-24_09:02:53 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.291704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.296560e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.296560e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 -TOTAL : 1.680127 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,675,085,647 cycles # 2.923 GHz - 11,509,492,893 instructions # 2.03 insn per cycle - 1.997903242 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 2.665296e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.665912e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665912e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825665e-06 ) GeV^-6 +TOTAL : 3.960752 sec + 12,066,914,409 cycles # 2.855 GHz + 20,727,627,711 instructions # 1.72 insn per cycle + 4.281530016 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.120892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 -TOTAL : 2.037220 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,712,310,342 cycles # 2.924 GHz - 13,777,135,261 instructions # 2.05 insn per cycle - 2.354099539 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.574125e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574397e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.574397e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.159980 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,121,008,944 cycles # 2.940 GHz - 53,916,989,652 instructions # 2.98 insn per cycle - 6.164330765 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.618409e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.618525e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.618525e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.400011 sec + 26,716,526,883 cycles # 2.842 GHz + 55,382,608,020 instructions # 2.07 insn per cycle + 9.404255165 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:64102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612086552360E-003 +Relative difference = 2.118765858747532e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.371688e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.372089e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.372089e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.568419 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,624,959,734 cycles # 2.942 GHz - 13,809,578,618 instructions # 2.99 insn per cycle - 1.572870258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.157410e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.157582e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.157582e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.450151 sec + 6,971,837,515 cycles # 2.841 GHz + 14,822,399,094 instructions # 2.13 insn per cycle + 2.454549843 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:223175) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.853120e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.854860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.854860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.206747e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.207489e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.207489e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.772760 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,127,660,282 cycles # 2.740 GHz - 4,839,303,130 instructions # 2.27 insn per cycle - 0.777110537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 1.259294 sec + 3,331,791,928 cycles # 2.638 GHz + 5,313,983,443 instructions # 1.59 insn per cycle + 1.263735048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:163023) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.707103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.709607e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.709607e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.558259e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.559041e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.559041e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.687680 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,889,891,398 cycles # 2.733 GHz - 4,293,271,631 instructions # 2.27 insn per cycle - 0.692031150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 1.161340 sec + 3,067,192,699 cycles # 2.633 GHz + 4,832,392,668 instructions # 1.58 insn per cycle + 1.165831525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.738421e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740575e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.785848 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,358,106,687 cycles # 1.720 GHz - 2,165,384,980 instructions # 1.59 insn per cycle - 0.790493646 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +EvtsPerSec[Rmb+ME] (23) = ( 4.955195e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.956327e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.956327e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.069338 sec + 1,788,653,322 cycles # 1.667 GHz + 2,423,301,522 instructions # 1.35 insn per cycle + 1.073851131 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3752) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 33a64296d4..0bbb0770e4 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
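[Editor's note on the new preamble fields] The updated preambles record a HASBLAS=hasBlas build flag, and the refreshed log headers just below add two runtime switches, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR, both empty in these runs. A minimal sketch of how such empty-by-default environment toggles are typically read at startup; the helper name and the "non-empty, non-zero enables it" semantics are assumptions, not the plugin's actual code:

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper (not the plugin's actual code): an unset or empty
// variable keeps the default; any non-empty, non-"0" value enables the feature.
static bool envFlagEnabled( const char* name )
{
  const char* v = std::getenv( name ); // nullptr if the variable is unset
  return v != nullptr && *v != '\0' && std::string( v ) != "0";
}

// Usage sketch:
//   bool useBlasColorSum = envFlagEnabled( "CUDACPP_RUNTIME_BLASCOLORSUM" );
```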
-DATE: 2024-10-06_09:14:03 +DATE: 2025-09-24_08:13:17 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.196404e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.197145e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.197475e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.664150e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664292e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664334e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762965 sec -INFO: No Floating Point Exceptions have been reported - 5,951,937,078 cycles # 2.924 GHz - 11,910,577,864 instructions # 2.00 insn per cycle - 2.092003198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.986879 sec + 12,144,476,957 cycles # 2.854 GHz + 20,149,489,558 instructions # 1.66 insn per cycle + 4.310343323 seconds time elapsed ......................................................................... 
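[Editor's note on reading the headline figures] The throughput and perf-stat numbers above are simple ratios: the logged EvtsPerSec counters divide the event count by the ME-only timers (so they exceed any wall-clock estimate, since TOTAL includes CUDA initialization), and the perf counters give instructions-per-cycle directly. A self-contained sketch reproducing the arithmetic with numbers copied from the CUDA hrd1 run above, purely illustrative and not the harness's own code:

```cpp
#include <cstdio>

int main()
{
  // From the log: -p 1 256 2 => 1 block x 256 threads x 2 iterations = 512 events
  const double nevt = 1. * 256. * 2.;
  const double totalSec = 3.986879;      // TOTAL time (includes CUDA setup)
  const double cycles = 12144476957.;    // perf: cycles
  const double instructions = 20149489558.; // perf: instructions
  printf( "events/sec (wall) = %.3e\n", nevt / totalSec ); // lower bound only:
  // the logged 2.66e+02 uses the ME-only timer, not the full wall clock
  printf( "insn per cycle    = %.2f\n", instructions / cycles ); // ~1.66 as logged
  return 0;
}
```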
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150749e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150840e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.074025 sec -INFO: No Floating Point Exceptions have been reported - 6,857,187,374 cycles # 2.930 GHz - 14,190,515,168 instructions # 2.07 insn per cycle - 2.396988151 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260107E-003 -Relative difference = 0.0021940095370041636 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.597266e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.597536e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597536e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.144692 sec -INFO: No Floating Point Exceptions have been reported - 18,086,727,911 cycles # 2.942 GHz - 53,895,836,183 instructions # 2.98 insn per cycle - 6.148512893 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.631133e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.631247e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.631247e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.378655 sec + 26,695,300,181 cycles # 2.846 GHz + 55,334,943,323 instructions # 2.07 insn per cycle + 9.382767082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:63872) (avx2: 0) (512y: 0) (512z: 0) 
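[Editor's note on the symbol tallies] The "=Symbols in CPPProcess_cpp.o=" lines classify the generated machine code by SIMD flavour; the much larger counts in the 2025 logs (e.g. ~sse4:63872 vs ~sse4:20141 before) reflect the new split-kernel code generation. A rough sketch of how such a tally can be produced by scanning a disassembly; this register-class heuristic and pipeline are assumptions for illustration, not the repository's actual counting script:

```cpp
#include <cstdio>
#include <cstring>

// Crude heuristic: xmm-only => ~sse4, ymm => avx2, zmm => 512z.
// (The real tally also separates AVX512VL-on-ymm instructions as "512y".)
int main()
{
  FILE* p = popen( "objdump -d CPPProcess_cpp.o", "r" ); // POSIX popen
  if( !p ) return 1;
  long sse4 = 0, avx2 = 0, z512 = 0;
  char line[512];
  while( fgets( line, sizeof line, p ) )
  {
    if( strstr( line, "%zmm" ) ) z512++;
    else if( strstr( line, "%ymm" ) ) avx2++;
    else if( strstr( line, "%xmm" ) ) sse4++;
  }
  pclose( p );
  printf( "(~sse4:%ld) (avx2:%ld) (512z:%ld)\n", sse4, avx2, z512 );
  return 0;
}
```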
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087572898E-003 -Relative difference = 2.1198021522715588e-08 +Avg ME (F77/C++) = 9.8479612086536921E-003 +Relative difference = 2.118750181351027e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.389069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.389069e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.560721 sec -INFO: No Floating Point Exceptions have been reported - 4,571,260,015 cycles # 2.924 GHz - 13,800,942,063 instructions # 3.02 insn per cycle - 1.564719207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96651) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.145880e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.146047e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.146047e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.462822 sec + 6,999,876,095 cycles # 2.839 GHz + 14,819,093,247 instructions # 2.12 insn per cycle + 2.466962797 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4:223171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896065809E-003 -Relative difference = 3.151856596628469e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.702410e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.704003e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.704003e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.229766e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.230442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.230442e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.789887 sec -INFO: No Floating Point Exceptions have been reported - 2,151,012,254 cycles # 2.712 GHz - 4,840,938,021 instructions # 2.25 insn per cycle - 0.793816354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85884) (512y: 0) (512z: 0) +TOTAL : 1.251478 sec + 3,312,244,464 cycles # 2.640 GHz + 5,310,879,695 instructions # 1.60 insn per cycle + 1.255633722 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162949) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.657646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.659745e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.659745e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.640122e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.640921e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.640921e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.691425 sec -INFO: No Floating Point Exceptions have been reported - 1,894,431,690 cycles # 2.727 GHz - 4,294,884,277 instructions # 2.27 insn per cycle - 0.695223368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81725) (512y: 25) (512z: 0) +TOTAL : 1.140652 sec + 3,025,617,561 cycles # 2.645 GHz + 4,829,183,559 instructions # 1.60 insn per cycle + 1.144897027 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.673392e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.675470e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.793743 sec -INFO: No Floating Point Exceptions have been reported - 1,366,656,580 cycles # 1.715 GHz - 2,169,713,805 instructions # 1.59 insn per cycle - 0.797745119 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4092) (512y: 32) (512z:79551) +EvtsPerSec[Rmb+ME] (23) = ( 4.947037e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.948324e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.948324e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.069982 sec + 1,785,300,398 cycles # 
1.663 GHz + 2,420,313,481 instructions # 1.36 insn per cycle + 1.074364371 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3680) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982957326E-003 -Relative difference = 2.0044082998332894e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..3d5df91db0 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. 
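[Editor's note on the scaling-test aborts] The new .scaling log that follows shows repeated "Assertion `code == gpuSuccess' failed" aborts once the GPU grid grows past the V100's memory. A minimal sketch of the kind of error-check wrapper that produces such an abort, with gpuSuccess aliased to the CUDA status code; the names are inferred from the log message, not taken from the plugin sources:

```cpp
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

#define gpuSuccess cudaSuccess

// Wrap every runtime call: print the error string, then abort via assert,
// which yields exactly the "Assertion `code == gpuSuccess' failed" message.
inline void checkGpu( cudaError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
    fprintf( stderr, "%s:%d %s\n", file, line, cudaGetErrorString( code ) );
  assert( code == gpuSuccess );
}
#define CHECK_GPU( call ) checkGpu( ( call ), __FILE__, __LINE__ )

// Usage sketch: CHECK_GPU( cudaMalloc( &buf, bytes ) );
```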
+ +DATE: 2025-09-24_08:25:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.624115e+02 1 256 +5.156633e+02 2 256 +1.004282e+03 4 256 +1.930573e+03 8 256 +3.544034e+03 16 256 +4.917510e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +3.375082e+01 1 32 +6.739138e+01 2 32 +1.340518e+02 4 32 +2.656472e+02 8 32 +5.223225e+02 16 32 +1.024733e+03 32 32 +1.958721e+03 64 32 +3.529263e+03 128 32 +4.944572e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.195066e+01 1 256 +5.199377e+01 2 256 +5.165247e+01 4 256 +### CPU: scaling test 32 +5.192832e+01 1 32 +5.136516e+01 2 32 +5.204074e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.023888e+02 1 256 +1.025918e+02 2 256 +1.018366e+02 4 256 +### CPU: scaling test 32 +1.028217e+02 1 32 +1.009493e+02 2 32 +1.030107e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.229650e+02 1 256 +2.211888e+02 2 256 +2.176932e+02 4 256 +### CPU: scaling test 32 +2.234860e+02 1 32 +2.229706e+02 2 32 +2.240995e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.407797e+02 1 256 +2.442304e+02 2 256 +2.440472e+02 4 256 +### CPU: scaling test 32 +2.394156e+02 1 32 +2.430410e+02 2 32 +2.406664e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.619663e+02 1 256 +2.606580e+02 2 256 
+2.595136e+02 4 256 +### CPU: scaling test 32 +2.631129e+02 1 32 +2.603531e+02 2 32 +2.606993e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a2a6307c02..2f143fdbaa 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,178 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:10:06 +DATE: 2025-09-24_08:05:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.665934e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.666666e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202831 sec -INFO: No Floating Point Exceptions have been reported - 7,373,914,452 cycles # 2.913 GHz - 16,351,055,335 instructions # 2.22 insn per cycle - 2.588547453 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.634681e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634859e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634901e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.911742 sec + 11,945,203,786 cycles # 2.853 GHz + 19,561,426,283 instructions # 1.64 insn per cycle + 4.244700831 seconds time elapsed ......................................................................... 
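[Editor's note on the scaling tables] In the GPU scaling table above, throughput roughly doubles with each doubling of the grid until the V100 starts to saturate (and, beyond 32 blocks, aborts on memory), while the single-threaded CPU columns stay flat as expected. A quick sketch computing the step-to-step speedups from the logged "scaling test 256" numbers, again purely illustrative:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // (evts/sec, nblocks) pairs from "### GPU: scaling test 256" above.
  const std::vector<std::pair<double, int>> gpu = {
    { 2.624115e+02, 1 }, { 5.156633e+02, 2 }, { 1.004282e+03, 4 },
    { 1.930573e+03, 8 }, { 3.544034e+03, 16 }, { 4.917510e+03, 32 } };
  for( size_t i = 1; i < gpu.size(); i++ )
    printf( "%2d -> %2d blocks: speedup %.2fx\n",
            gpu[i - 1].second, gpu[i].second, gpu[i].first / gpu[i - 1].first );
  return 0; // the last step drops to ~1.39x: the GPU is saturating
}
```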
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110897e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111222e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.442430 sec -INFO: No Floating Point Exceptions have been reported - 11,070,694,428 cycles # 2.924 GHz - 25,628,142,124 instructions # 2.31 insn per cycle - 3.841933628 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 114 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722598175428403E-003 +Relative difference = 3.223634904086631e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.567548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.567783e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.567783e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.168610e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.168705e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.168705e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.978728 sec -INFO: No Floating Point Exceptions have been reported - 19,201,924,470 cycles # 2.751 GHz - 54,137,446,015 instructions # 2.82 insn per cycle - 6.982563293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32000) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 10.202641 sec + 27,008,826,144 cycles # 2.647 GHz + 55,489,845,751 instructions # 2.05 insn per cycle + 10.206796166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526939e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526939e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.016248e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.016285e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.016285e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.460419 sec -INFO: No Floating Point Exceptions have been reported - 9,442,620,757 cycles # 2.727 GHz - 26,188,001,033 instructions # 2.77 insn per cycle - 3.464377416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96049) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.198364 sec + 14,820,744,558 cycles # 2.849 GHz + 28,052,770,640 instructions # 1.89 insn per cycle + 5.202377269 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:222782) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.548969e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549418e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549418e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.173574e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.173760e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.173760e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.489614 sec -INFO: No Floating Point Exceptions have been reported - 4,075,741,004 cycles # 2.731 GHz - 9,249,825,182 instructions # 2.27 insn per cycle - 1.493453651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 2.431646 sec + 6,422,559,220 cycles # 2.638 GHz + 10,106,796,156 instructions # 1.57 insn per cycle + 2.435786868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162525) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.098256e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098850e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098850e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.407760e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407969e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407969e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.290484 sec -INFO: No Floating Point Exceptions have been reported - 3,523,951,603 cycles # 2.724 GHz - 8,183,239,467 instructions # 2.32 insn per cycle - 1.294382992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80015) (512y: 80) (512z: 0) +TOTAL : 2.195988 sec + 5,811,914,222 cycles # 2.643 GHz + 9,159,041,975 instructions # 1.58 insn per cycle + 2.200185899 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162162) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.495372e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.495944e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551665e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.551966e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551966e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513924 sec -INFO: No Floating Point Exceptions have been reported - 2,658,314,764 cycles # 1.752 GHz - 4,173,156,780 instructions # 1.57 insn per cycle - 1.517996809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 92) (512z:78910) +TOTAL : 2.072914 sec + 3,457,103,850 cycles # 1.665 GHz + 4,585,756,837 instructions # 1.33 insn per cycle + 2.077304614 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3149) (512y: 61) (512z:174183) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 67fff86657..bead3fdf60 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,178 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:11:33 +DATE: 2025-09-24_08:07:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.667678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.668217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.668387e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202686 sec -INFO: No Floating Point Exceptions have been reported - 7,336,606,843 cycles # 2.899 GHz - 15,241,236,080 instructions # 2.08 insn per cycle - 2.586897924 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.631204e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.631383e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.631422e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.909015 sec + 11,924,587,141 cycles # 2.856 GHz + 19,714,302,266 instructions # 1.65 insn per cycle + 4.229466301 seconds time elapsed ......................................................................... 
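
Note: throughout these logs, each build is cross-checked by comparing the average matrix element of the C++/CUDA executable against its Fortran counterpart (the paired cmpExe check/fcheck lines), with an OK verdict whenever the relative difference stays below 5E-3. A minimal sketch of such a tolerance check (illustrative only, not the actual madgraph4gpu test driver; the exact definition of the relative difference is an assumption):

  #include <cmath>
  #include <cstdio>
  // Compare two average MEs; 'tol' mirrors the 5E-3 threshold quoted in the logs.
  bool cmpAvgME( double meCpp, double meF77, double tol = 5e-3 )
  {
    const double reldif = std::abs( meCpp - meF77 ) / std::abs( meCpp );
    std::printf( "Relative difference = %g\n", reldif );
    std::printf( "%s (relative difference <= %g)\n", reldif <= tol ? "OK" : "ERROR", tol );
    return reldif <= tol;
  }

The loose 5E-3 threshold is plausibly deliberate: the C++ average is printed with only seven significant digits (e.g. 9.872263e-03), so the observed O(1E-7) differences are at the level of the printed precision rather than a sign of numerical disagreement.
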
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107889e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.440073 sec -INFO: No Floating Point Exceptions have been reported - 11,052,276,434 cycles # 2.923 GHz - 25,411,180,343 instructions # 2.30 insn per cycle - 3.836365671 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 116 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
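
Note: the most visible functional change in the 2025 ncu profiles is that the old monolithic "sigmaKin" kernel (255 registers/thread) has been replaced by separate per-diagram kernels ("diagram1", "diagram2", ...) plus a "color_sum_kernel", consistent with the new HASBLAS=hasBlas and CUDACPP_RUNTIME_BLASCOLORSUM flags appearing in the log headers. A heavily simplified sketch of this split (assumed structure for illustration only; apart from the kernel names reported by ncu above, all names and signatures here are hypothetical):

  #include <cuComplex.h>
  // Each diagram kernel accumulates per-event color-ordered amplitudes ("jamps")...
  __global__ void diagram1( const double* momenta, cuDoubleComplex* jamps ) { /* wavefunctions and vertices of diagram 1 */ }
  __global__ void diagram2( const double* momenta, cuDoubleComplex* jamps ) { /* wavefunctions and vertices of diagram 2 */ }
  // ...and a final kernel contracts them with the (real) color matrix:
  // |ME|^2 ~ sum_ij jamp_i * C_ij * conj(jamp_j). With jamps stored as an
  // (nevt x ncolor) matrix this contraction is a GEMM, hence the BLAS option.
  __global__ void color_sum_kernel( const cuDoubleComplex* jamps, const double* colorMat, double* me2, int ncolor )
  {
    const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
      {
        const cuDoubleComplex prod = cuCmul( jamps[ievt * ncolor + i], cuConj( jamps[ievt * ncolor + j] ) );
        me += colorMat[i * ncolor + j] * cuCreal( prod );
      }
    me2[ievt] = me;
  }

One side effect visible in the profiles is lower per-kernel register pressure: the gq_ttq kernels further below report 80/74/40 registers/thread where the fused sigmaKin previously needed 255.
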
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722598175428403E-003 +Relative difference = 3.223634904086631e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.653903e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.654105e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.654105e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.175853e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.175960e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.175960e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.889827 sec -INFO: No Floating Point Exceptions have been reported - 19,201,166,017 cycles # 2.786 GHz - 54,161,677,415 instructions # 2.82 insn per cycle - 6.893652512 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32202) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 10.191887 sec + 27,066,562,296 cycles # 2.655 GHz + 55,432,709,827 instructions # 2.05 insn per cycle + 10.196042947 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:82721) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.552412e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552503e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552503e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014775e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014811e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014811e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.403221 sec -INFO: No Floating Point Exceptions have been reported - 9,295,420,050 cycles # 2.729 GHz - 26,089,296,035 instructions # 2.81 insn per cycle - 3.407123949 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.204912 sec + 14,810,174,964 cycles # 2.844 GHz + 28,051,432,256 instructions # 1.89 insn per cycle + 5.208973811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:222585) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.556434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.185805e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.185996e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.185996e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488620 sec -INFO: No Floating Point Exceptions have been reported - 4,059,104,235 cycles # 2.721 GHz - 9,213,839,753 instructions # 2.27 insn per cycle - 1.492560916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83864) (512y: 0) (512z: 0) +TOTAL : 2.419296 sec + 6,397,293,732 cycles # 2.640 GHz + 10,105,068,555 instructions # 1.58 insn per cycle + 2.423434285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.125241e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125840e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125840e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353455e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353659e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353659e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.282211 sec -INFO: No Floating Point Exceptions have been reported - 3,511,408,538 cycles # 2.732 GHz - 8,168,208,932 instructions # 2.33 insn per cycle - 1.286095846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79421) (512y: 230) (512z: 0) +TOTAL : 2.246140 sec + 5,927,041,948 cycles # 2.635 GHz + 9,157,270,352 instructions # 1.54 insn per cycle + 2.250289465 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162094) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.517573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518129e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518129e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.574266e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574580e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574580e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.503444 sec -INFO: No Floating Point Exceptions have been reported - 2,622,176,822 cycles # 1.740 GHz - 4,167,750,292 instructions # 1.59 insn per cycle - 1.507552292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1879) (512y: 174) (512z:78884) +TOTAL : 2.053659 sec + 3,414,953,228 cycles # 1.660 GHz + 4,584,587,859 instructions # 1.34 insn per cycle + 2.057980570 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 61) (512z:174183) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..0c9d4dd187 --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-09-24_08:20:53 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +6.114382e+05 1 256 +1.184804e+06 2 256 +2.295799e+06 4 256 +4.299152e+06 8 256 +6.781749e+06 16 256 +6.234987e+06 32 256 +4.824464e+06 64 256 +5.008277e+06 128 256 +5.002497e+06 256 256 +5.105111e+06 512 256 +5.144414e+06 1024 256 +### GPU: scaling test 32 +7.442778e+04 1 32 +1.705130e+05 2 32 +3.324580e+05 4 32 +6.361197e+05 8 32 +1.276311e+06 16 32 +2.506315e+06 32 32 +4.622242e+06 64 32 +6.583981e+06 128 32 +6.111331e+06 256 32 +4.906391e+06 512 32 +5.038809e+06 1024 32 +4.994793e+06 2048 32 +5.116613e+06 4096 32 +5.145819e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.883864e+04 1 256 +1.012169e+05 2 256 +1.022878e+05 4 256 +### CPU: scaling test 32 +9.538686e+04 1 32 +9.747153e+04 2 32 +9.954397e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.671310e+05 1 256 +1.759690e+05 2 256 +1.760443e+05 4 256 +### CPU: scaling test 32 +1.499075e+05 1 32 +1.562893e+05 2 32 +1.522789e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.113537e+05 1 256 +3.215123e+05 2 256 +2.807500e+05 4 256 +### CPU: scaling 
test 32 +2.961647e+05 1 32 +2.870406e+05 2 32 +2.943923e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.149885e+05 1 256 +3.152403e+05 2 256 +3.263600e+05 4 256 +### CPU: scaling test 32 +3.402265e+05 1 32 +3.182797e+05 2 32 +3.367517e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.236251e+05 1 256 +2.336378e+05 2 256 +2.275319e+05 4 256 +### CPU: scaling test 32 +2.341184e+05 1 32 +2.208305e+05 2 32 +2.274892e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 468f6865a8..a3350c15b3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:05:51 +DATE: 2025-09-24_07:56:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.906944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.013821e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.369412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858836e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.893773e+06 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458221 sec -INFO: No Floating Point Exceptions have been reported - 1,930,997,109 cycles # 2.858 GHz - 2,724,198,211 instructions # 1.41 insn per cycle - 0.805328419 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.493507 sec + 2,094,463,287 cycles # 2.825 GHz + 2,949,447,803 instructions # 1.41 insn per cycle + 0.799505845 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.002453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463176e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.675243e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.544639 sec -INFO: No Floating Point Exceptions have been reported - 2,250,691,324 cycles # 2.871 GHz - 3,190,813,390 instructions # 1.42 insn per cycle - 0.843484638 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 80 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
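
Note: the new *.scaling files added above report one line per grid configuration in the format "throughput nblocks nthreads"; presumably the first column is the measured events/sec for that grid, with nblocks x nthreads events per iteration. On this V100S the gq_ttq GPU throughput climbs roughly linearly with the grid, peaks around 6.8e6/s at 16x256, and settles near 5e6/s once the device is saturated, while the CPU builds stay flat in the grid size, as expected for a host loop. A trivial sketch of what one row encodes (assumed formula; names hypothetical):

  // One row of a .scaling file, under the assumption that
  // throughput = (nblocks * nthreads * niterations) / elapsed_seconds.
  double scalingRowThroughput( int nblocks, int nthreads, int niter, double elapsedSec )
  {
    const long nevt = (long)nblocks * nthreads; // events per iteration
    return (double)nevt * niter / elapsedSec;   // EvtsPerSec, column 1
  }
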
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482467490466 Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.052668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075406e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075406e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011082e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032940e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.578445 sec -INFO: No Floating Point Exceptions have been reported - 4,629,037,835 cycles # 2.928 GHz - 13,193,545,970 instructions # 2.85 insn per cycle - 1.584589009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.641008 sec + 4,728,634,035 cycles # 2.877 GHz + 13,391,733,023 instructions # 2.83 insn per cycle + 1.644935255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.869817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940106e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.798767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895982 sec -INFO: No Floating Point Exceptions have been reported - 2,636,174,950 cycles # 2.931 GHz - 7,556,706,256 instructions # 2.87 insn per cycle - 0.901753059 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.964620 sec + 2,778,184,561 cycles # 2.870 GHz + 7,810,599,541 instructions # 2.81 insn per cycle + 0.968650361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3027) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.170738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.999275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.191668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.191668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538337 sec -INFO: No Floating Point Exceptions have been reported - 1,492,365,440 cycles # 2.760 GHz - 3,161,633,609 instructions # 2.12 insn per cycle - 0.543901971 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.566602 sec + 1,524,796,725 cycles # 2.675 GHz + 3,239,090,759 instructions # 2.12 insn per cycle + 0.570567687 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.753079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.753079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186463e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.401752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.401752e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488254 sec -INFO: No Floating Point Exceptions have been reported - 1,345,193,436 cycles # 2.734 GHz - 3,015,805,712 instructions # 2.24 insn per cycle - 0.494320620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.534680 sec + 1,443,469,002 cycles # 2.683 GHz + 3,113,047,726 instructions # 2.16 insn per cycle + 0.538728930 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2729) (512y: 51) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
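
Note: the "Internal loops fptype_sv" line identifies the SIMD mode of each build: SCALAR for none, VECTOR[2] doubles per 128-bit SSE4.2 register, VECTOR[4] per 256-bit AVX2/512y register, and VECTOR[8] per 512-bit AVX512 'z' register. That is the mechanism behind the roughly 3x step from none (1.03e5/s) to avx2 (3.19e5/s) seen in this gq_ttq log. A minimal sketch of the idea using GCC vector extensions (illustrative; the plugin's actual vector types are defined in its own headers):

  // A 256-bit vector of 4 doubles, processing 4 events in lockstep per operation.
  typedef double fptype_v __attribute__( ( vector_size( 32 ) ) );
  fptype_v fpmadd( const fptype_v a, const fptype_v b, const fptype_v c )
  {
    return a * b + c; // the compiler emits one AVX2 mul+add (or fma) per 4 events
  }
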
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.340176e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.450488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.450488e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.232076e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.339196e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.339196e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.722488 sec
-INFO: No Floating Point Exceptions have been reported
- 1,326,137,037 cycles # 1.826 GHz
- 1,964,340,659 instructions # 1.48 insn per cycle
- 0.728328312 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218)
+TOTAL : 0.756449 sec
+ 1,339,965,318 cycles # 1.764 GHz
+ 1,936,048,197 instructions # 1.44 insn per cycle
+ 0.760584848 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index a32e85fd77..f54f9f771f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,272 +10,220 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:39:10
+DATE: 2025-09-24_08:57:37
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.313371e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.590831e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.590831e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.054488e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.251839e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.251839e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.487212 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,046,207,140 cycles # 2.880 GHz
- 3,015,907,255 instructions # 1.47 insn per cycle
- 0.769534809 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
+TOTAL : 0.519849 sec
+ 2,167,632,544 cycles # 2.822 GHz
+ 3,174,291,906 instructions # 1.46 insn per cycle
+ 0.825102214 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
-.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP=
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.228660e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.270938e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.270938e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.758730 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,917,079,859 cycles # 2.883 GHz
- 4,489,082,127 instructions # 1.54 insn per cycle
- 1.069078440 seconds time elapsed
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482467490466
Relative difference = 5.286902838873106e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.058535e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.081557e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.081557e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.006346e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.028061e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.028061e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.574537 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 4,656,483,821 cycles # 2.950 GHz
- 13,198,201,576 instructions # 2.83 insn per cycle
- 1.579077435 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.657332 sec
+ 4,772,691,593 cycles # 2.873 GHz
+ 13,397,727,029 instructions # 2.81 insn per cycle
+ 1.661626356 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499481
Relative difference = 5.286896511435107e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.861172e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.931943e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.931943e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.722779e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.788329e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.788329e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.907508 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,678,662,656 cycles # 2.939 GHz
- 7,605,263,564 instructions # 2.84 insn per cycle
- 0.912202227 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.979997 sec
+ 2,824,593,348 cycles # 2.871 GHz
+ 7,859,167,507 instructions # 2.78 insn per cycle
+ 0.984421558 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3027) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499475
Relative difference = 5.286896515331313e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.153263e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.357026e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.357026e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.894342e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.081563e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.081563e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.547067 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,524,781,245 cycles # 2.767 GHz
- 3,210,388,287 instructions # 2.11 insn per cycle
- 0.551691801 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0)
+TOTAL : 0.595363 sec
+ 1,574,100,751 cycles # 2.628 GHz
+ 3,287,987,619 instructions # 2.09 insn per cycle
+ 0.599823612 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2908) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.508777e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.767060e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.767060e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.137842e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.355195e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.355195e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.494747 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,383,177,469 cycles # 2.773 GHz
- 3,064,481,068 instructions # 2.22 insn per cycle
- 0.499446571 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0)
+TOTAL : 0.552181 sec
+ 1,491,380,997 cycles # 2.683 GHz
+ 3,163,058,147 instructions # 2.12 insn per cycle
+ 0.556586621 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2729) (512y: 51) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.351157e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.462501e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.462501e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.211794e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.320275e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320275e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.725065 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,357,891,290 cycles # 1.863 GHz
- 2,000,455,329 instructions # 1.47 insn per cycle
- 0.729577819 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218)
+TOTAL : 0.771732 sec
+ 1,381,853,144 cycles # 1.782 GHz
+ 1,973,253,744 instructions # 1.43 insn per cycle
+ 0.776188665 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index 67eac99bab..df00e76677 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:04
+DATE: 2025-09-24_07:57:13
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.866343e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.840904e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.947003e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.374602e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.850583e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.886296e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.463809 sec
-INFO: No Floating Point Exceptions have been reported
- 1,942,418,108 cycles # 2.861 GHz
- 2,721,411,859 instructions # 1.40 insn per cycle
- 0.812650633 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.494162 sec
+ 2,088,078,987 cycles # 2.820 GHz
+ 2,936,679,024 instructions # 1.41 insn per cycle
+ 0.798930083 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.997280e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.399599e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.603946e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.538885 sec
-INFO: No Floating Point Exceptions have been reported
- 2,239,160,610 cycles # 2.873 GHz
- 3,203,384,758 instructions # 1.43 insn per cycle
- 0.836856412 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482467490466
Relative difference = 5.286902838873106e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.060643e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.083213e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.083213e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.011141e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.032924e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.032924e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.565121 sec
-INFO: No Floating Point Exceptions have been reported
- 4,623,795,988 cycles # 2.948 GHz
- 13,181,888,102 instructions # 2.85 insn per cycle
- 1.571833324 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.640764 sec
+ 4,726,378,497 cycles # 2.875 GHz
+ 13,388,410,697 instructions # 2.83 insn per cycle
+ 1.644936414 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 746) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499481
Relative difference = 5.286896511435107e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.878003e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.949625e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.949625e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.733886e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.797913e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.797913e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.892385 sec
-INFO: No Floating Point Exceptions have been reported
- 2,641,116,720 cycles # 2.947 GHz
- 7,555,506,374 instructions # 2.86 insn per cycle
- 0.899472366 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.964517 sec
+ 2,777,828,465 cycles # 2.870 GHz
+ 7,808,249,189 instructions # 2.81 insn per cycle
+ 0.968518484 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3022) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499475
Relative difference = 5.286896515331313e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.178148e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.383095e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.383095e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.010096e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.204120e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.204120e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.535311 sec
-INFO: No Floating Point Exceptions have been reported
- 1,491,222,481 cycles # 2.767 GHz
- 3,161,019,864 instructions # 2.12 insn per cycle
- 0.541387025 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2976) (512y: 0) (512z: 0)
+TOTAL : 0.564633 sec
+ 1,525,088,852 cycles # 2.684 GHz
+ 3,238,233,498 instructions # 2.12 insn per cycle
+ 0.568864130 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2894) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.523592e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.778898e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.778898e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.162469e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.377065e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.377065e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.485060 sec
-INFO: No Floating Point Exceptions have been reported
- 1,349,314,232 cycles # 2.763 GHz
- 3,012,812,614 instructions # 2.23 insn per cycle
- 0.489068736 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2726) (512y: 104) (512z: 0)
+TOTAL : 0.538225 sec
+ 1,445,712,987 cycles # 2.669 GHz
+ 3,111,497,224 instructions # 2.15 insn per cycle
+ 0.542313151 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2713) (512y: 51) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.347943e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.459729e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.459729e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.241673e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.349775e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.349775e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.720112 sec
-INFO: No Floating Point Exceptions have been reported
- 1,326,103,986 cycles # 1.833 GHz
- 1,962,664,460 instructions # 1.48 insn per cycle
- 0.726078775 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1356) (512y: 106) (512z: 2218)
+TOTAL : 0.753239 sec
+ 1,337,847,804 cycles # 1.768 GHz
+ 1,935,134,575 instructions # 1.45 insn per cycle
+ 0.757158715 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1524) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..246da016ac --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-09-24_08:21:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +6.410931e+05 1 256 +1.277149e+06 2 256 +2.468261e+06 4 256 +4.940702e+06 8 256 +8.703302e+06 16 256 +1.240333e+07 32 256 +1.176459e+07 64 256 +1.135181e+07 128 256 +1.232077e+07 256 256 +1.259780e+07 512 256 +1.305679e+07 1024 256 +### GPU: scaling test 32 +8.109047e+04 1 32 +1.670604e+05 2 32 +3.290091e+05 4 32 +6.250900e+05 8 32 +1.280682e+06 16 32 +2.510148e+06 32 32 +5.005487e+06 64 32 +8.591415e+06 128 32 +1.209171e+07 256 32 +1.153836e+07 512 32 +1.111319e+07 1024 32 +1.175548e+07 2048 32 +1.195470e+07 4096 32 +1.227044e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.905691e+04 1 256 +1.034121e+05 2 256 +1.047026e+05 4 256 +### CPU: scaling test 32 +9.038706e+04 1 32 +9.069344e+04 2 32 +1.041458e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.561891e+05 1 256 +2.730252e+05 2 256 +2.836441e+05 4 256 +### CPU: scaling test 32 +2.501329e+05 1 32 +2.570952e+05 2 32 +2.554758e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.750509e+05 1 256 +6.287586e+05 2 256 +6.269094e+05 4 256 +### CPU: scaling test 32 +5.674363e+05 1 32 +5.794110e+05 2 32 +5.625484e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.160214e+05 1 256 +6.181977e+05 2 256 +6.142127e+05 4 256 +### CPU: scaling test 32 +6.002739e+05 1 32 +6.110485e+05 2 32 +4.928480e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.823290e+05 1 256 +4.859783e+05 2 256 +4.896342e+05 4 256 +### CPU: scaling test 32 +4.753134e+05 1 32 +4.306893e+05 2 32 +4.931574e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index fa95ebd131..0aa114dc67 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:45 +DATE: 2025-09-24_07:58:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.818001e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982501e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.122889e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.029737e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192162e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.452282 sec -INFO: No Floating Point Exceptions have been reported - 1,920,727,034 cycles # 2.860 GHz - 2,694,517,558 instructions # 1.40 insn per cycle - 0.728408510 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.470353 sec + 2,024,797,775 cycles # 2.821 GHz + 2,795,474,884 instructions # 1.38 insn per cycle + 0.775690419 seconds time elapsed ......................................................................... 
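The "insn per cycle" annotation in each perf block is simply the ratio of the two counters above it; for the CUDA float run just shown:

cycles = 2_024_797_775        # cycles counter from the run above
instructions = 2_795_474_884  # instructions counter from the run above
print(f"{instructions / cycles:.2f} insn per cycle")  # -> 1.38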
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.287877e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.320334e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.683236e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.495314 sec -INFO: No Floating Point Exceptions have been reported - 2,079,539,950 cycles # 2.850 GHz - 2,952,237,418 instructions # 1.42 insn per cycle - 0.786339466 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 54 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
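The new .scaling files introduced in this diff carry plain data lines of the form "<throughput> <gpublocks> <gputhreads>" between "### ... scaling test" headers. A small sketch (not part of the repository) for extracting the peak throughput per threads-per-block setting from such a file:

from collections import defaultdict

def peak_throughput(path: str) -> dict[int, float]:
    # data lines look like "6.410931e+05 1 256"; everything else is skipped
    best: dict[int, float] = defaultdict(float)
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) == 3 and parts[0][:1].isdigit():
                tput, _blocks, threads = float(parts[0]), int(parts[1]), int(parts[2])
                best[threads] = max(best[threads], tput)
    return dict(best)

# e.g. peak_throughput("log_gqttq_mad_f_inl0_hrd0.scaling")
#      -> {256: 1.305679e+07, 32: 1.227044e+07} (the GPU rows dominate)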
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487915986667 +Relative difference = 0.00036707067464478155 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.109567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.134660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.041265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.064683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064683e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.494759 sec -INFO: No Floating Point Exceptions have been reported - 4,403,081,916 cycles # 2.940 GHz - 12,951,948,710 instructions # 2.94 insn per cycle - 1.498420981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.592374 sec + 4,584,720,385 cycles # 2.874 GHz + 13,173,886,622 instructions # 2.87 insn per cycle + 1.596196458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246861273712064 +Relative difference = 8.940300273875148e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.886806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.066754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.066754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.712937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.876014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.876014e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.584675 sec -INFO: No Floating Point Exceptions have been reported - 1,726,276,919 cycles # 2.937 GHz - 4,542,407,737 instructions # 2.63 insn per cycle - 0.588476135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.621667 sec + 1,792,962,809 cycles # 2.869 GHz + 4,663,415,265 instructions # 2.60 insn per cycle + 0.625726960 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.651382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.346145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.346145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.565327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.255556e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.255556e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.307816 sec -INFO: No Floating Point Exceptions have been reported - 856,647,676 cycles # 2.754 GHz - 1,917,830,464 instructions # 2.24 insn per cycle - 0.311794908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.312612 sec + 851,138,638 cycles # 2.694 GHz + 1,911,731,220 instructions # 2.25 insn per cycle + 0.316680357 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.083995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.826176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.593217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.593217e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287118 sec -INFO: No Floating Point Exceptions have been reported - 801,284,784 cycles # 2.760 GHz - 1,834,043,941 instructions # 2.29 insn per cycle - 0.290894624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.299534 sec + 816,595,199 cycles # 2.696 GHz + 1,843,533,646 instructions # 2.26 insn per cycle + 0.303518277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3333) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.500723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.948038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.948038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.425940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.876492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.876492e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.384030 sec -INFO: No Floating Point Exceptions have been reported - 726,928,592 cycles # 1.877 GHz - 1,308,660,654 instructions # 1.80 insn per cycle - 0.387900268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.390882 sec + 717,534,817 cycles # 1.821 GHz + 1,247,139,337 instructions # 1.74 insn per cycle + 0.394851024 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2570) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
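The "=Symbols in CPPProcess_cpp.o=" counters track how many SIMD instructions of each family end up in the object file (note how the 512y bucket drops to 0 in the float builds with this compiler). A hypothetical sketch of how such counts could be produced by register width; the real tput scripts evidently also separate a "512y" bucket (AVX-512 instructions operating on 256-bit ymm registers), which a width-only classification like this one cannot distinguish from plain avx2:

import re, subprocess, sys

def count_simd(objfile: str) -> dict[str, int]:
    # disassemble with objdump and bucket instructions by widest register used
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    counts = {"xmm": 0, "ymm": 0, "zmm": 0}
    for line in asm.splitlines():
        for reg in ("zmm", "ymm", "xmm"):
            if re.search(rf"%{reg}\d+", line):
                counts[reg] += 1
                break  # count each instruction once, at its widest register
    return counts

if __name__ == "__main__":
    print(count_simd(sys.argv[1]))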
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 5a6a874489..e0303b7ed7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:24 +DATE: 2025-09-24_08:57:58 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.958276e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.362856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.362856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669508e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.781327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.781327e+06 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.467586 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,958,269,299 cycles # 2.868 GHz - 2,873,921,299 instructions # 1.47 insn per cycle - 0.741370031 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.482541 sec + 2,080,194,939 cycles # 2.814 GHz + 2,973,065,127 instructions # 1.43 insn per cycle + 0.797267871 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 54 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.867040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.953002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.953002e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638465 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,513,600,406 cycles # 2.877 GHz - 3,810,036,638 instructions # 1.52 insn per cycle - 0.930171723 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487915986667 +Relative difference = 0.00036707067464478155 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.115307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.056984e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056984e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490082 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,418,597,373 cycles # 2.958 GHz - 12,956,387,401 instructions # 2.93 insn per cycle - 1.494530314 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.608143 sec + 4,605,847,309 cycles # 2.859 GHz + 13,177,560,156 instructions # 2.86 insn per cycle + 1.612112149 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
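Comparing the two CUDA float runs in this diff quantifies the cost of the host-side Bridge (workflow CURHST+RMBHST+BRDDEV instead of CURDEV+RMBDEV+MESDEV); the values below are the EvtsPerSec[MatrixElems] figures from the plain and --bridge logs above:

no_bridge = 1.178192e+07  # plain CUDA float run
bridge    = 9.781327e+06  # --bridge CUDA float run
print(f"bridge retains {bridge / no_bridge:.0%} of the throughput")  # ~83%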
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246861273712064 +Relative difference = 8.940300273875148e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.871197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.051268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.051268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.693966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.860851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.860851e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.592243 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,749,393,716 cycles # 2.936 GHz - 4,590,457,409 instructions # 2.62 insn per cycle - 0.596762261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.631429 sec + 1,821,115,428 cycles # 2.868 GHz + 4,710,567,012 instructions # 2.59 insn per cycle + 0.635587274 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.650062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.340176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.340176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.470965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.164241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.164241e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311783 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 875,769,738 cycles # 2.776 GHz - 1,954,803,706 instructions # 2.23 insn per cycle - 0.316080972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.322773 sec + 876,800,208 cycles # 2.687 GHz + 1,948,167,782 instructions # 2.22 insn per cycle + 0.326941987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.042794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.845843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.845843e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.766714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.528672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.528672e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293361 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 825,335,769 cycles # 2.779 GHz - 1,870,845,111 instructions # 2.27 insn per cycle - 0.297556229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.306939 sec + 839,001,914 cycles # 2.704 GHz + 1,879,706,999 instructions # 2.24 insn per cycle + 0.310854322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3333) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.484934e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.935540e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.935540e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.381614e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.819126e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.819126e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.390040 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 749,752,693 cycles # 1.904 GHz
- 1,350,296,093 instructions # 1.80 insn per cycle
- 0.394449871 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435)
+TOTAL : 0.399460 sec
+ 742,424,746 cycles # 1.843 GHz
+ 1,287,364,599 instructions # 1.73 insn per cycle
+ 0.403527066 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2570)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491576758442
Relative difference = 1.1066920862943416e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index cea07bf7e8..bb4764fe5e 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:58
+DATE: 2025-09-24_07:58:43
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.801672e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.945717e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.092440e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.032335e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.184891e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.198749e+07 ) sec^-1
MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.453252 sec
-INFO: No Floating Point Exceptions have been reported
- 1,914,636,683 cycles # 2.859 GHz
- 2,699,162,883 instructions # 1.41 insn per cycle
- 0.727606605 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 169
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.471162 sec
+ 2,022,729,281 cycles # 2.819 GHz
+ 2,796,281,381 instructions # 1.38 insn per cycle
+ 0.775833852 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.322683e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.438723e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.801307e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2
-TOTAL : 0.493317 sec
-INFO: No Floating Point Exceptions have been reported
- 2,100,361,107 cycles # 2.862 GHz
- 2,955,351,040 instructions # 1.41 insn per cycle
- 0.791031778 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 54
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 50
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424226e-01
-Avg ME (F77/GPU) = 0.14247487904286338
-Relative difference = 0.0003670698531228044
+Avg ME (F77/GPU) = 0.14247487915986667
+Relative difference = 0.00036707067464478155
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.112466e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.138003e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.138003e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.040661e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.064089e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.064089e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2
-TOTAL : 1.490381 sec
-INFO: No Floating Point Exceptions have been reported
- 4,405,341,411 cycles # 2.950 GHz
- 12,928,117,316 instructions # 2.93 insn per cycle
- 1.494164072 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.592943 sec
+ 4,581,299,500 cycles # 2.870 GHz
+ 13,170,651,939 instructions # 2.87 insn per cycle
+ 1.596909897 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246861273719524
-Relative difference = 8.940352641194861e-08
+Avg ME (F77/C++) = 0.14246861273712064
+Relative difference = 8.940300273875148e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.897278e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.076728e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.076728e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.701985e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.865053e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.865053e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2
-TOTAL : 0.582482 sec
-INFO: No Floating Point Exceptions have been reported
- 1,724,294,786 cycles # 2.945 GHz
- 4,536,655,836 instructions # 2.63 insn per cycle
- 0.586223274 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3611) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.624193 sec
+ 1,795,013,511 cycles # 2.860 GHz
+ 4,662,177,938 instructions # 2.60 insn per cycle
+ 0.628349991 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3609) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424686e-01
Avg ME (F77/C++) = 0.14246862329122401
Relative difference = 1.6348320966878032e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.690817e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.397497e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397497e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.294900e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.922794e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.922794e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.305315 sec
-INFO: No Floating Point Exceptions have been reported
- 857,155,838 cycles # 2.779 GHz
- 1,914,615,212 instructions # 2.23 insn per cycle
- 0.309003061 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3549) (512y: 0) (512z: 0)
+TOTAL : 0.328192 sec
+ 851,932,871 cycles # 2.569 GHz
+ 1,910,738,870 instructions # 2.24 insn per cycle
+ 0.332201917 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3441) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.056800e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.870570e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.870570e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.776657e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.523494e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.523494e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.288177 sec
-INFO: No Floating Point Exceptions have been reported
- 804,254,194 cycles # 2.761 GHz
- 1,829,977,116 instructions # 2.28 insn per cycle
- 0.291930002 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3364) (512y: 22) (512z: 0)
+TOTAL : 0.301663 sec
+ 816,360,354 cycles # 2.677 GHz
+ 1,842,622,689 instructions # 2.26 insn per cycle
+ 0.305601859 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3313) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.550897e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.994144e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.994144e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.462455e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.903984e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.903984e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.380837 sec
-INFO: No Floating Point Exceptions have been reported
- 727,485,601 cycles # 1.894 GHz
- 1,306,171,995 instructions # 1.80 insn per cycle
- 0.384559776 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1928) (512y: 24) (512z: 2435)
+TOTAL : 0.387603 sec
+ 714,478,577 cycles # 1.828 GHz
+ 1,245,972,447 instructions # 1.74 insn per cycle
+ 0.391617356 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2055) (512y: 5) (512z: 2570)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491576758442
Relative difference = 1.1066920862943416e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..f37650a768
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+DATE: 2025-09-24_08:21:14
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+6.159458e+05 1 256
+1.208237e+06 2 256
+2.338380e+06 4 256
+4.313977e+06 8 256
+6.563882e+06 16 256
+5.902898e+06 32 256
+4.772882e+06 64 256
+4.983398e+06 128 256
+5.014489e+06 256 256
+5.095821e+06 512 256
+5.127323e+06 1024 256
+### GPU: scaling test 32
+8.355877e+04 1 32
+1.680315e+05 2 32
+3.138740e+05 4 32
+6.033111e+05 8 32
+1.307680e+06 16 32
+2.486421e+06 32 32
+4.782041e+06 64 32
+6.585802e+06 128 32
+5.416696e+06 256 32
+4.888900e+06 512 32
+5.023808e+06 1024 32
+4.990474e+06 2048 32
+5.087796e+06 4096 32
+5.145084e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.882800e+04 1 256
+9.861797e+04 2 256
+1.017932e+05 4 256
+### CPU: scaling test 32
+8.655082e+04 1 32
+8.835276e+04 2 32
+9.886339e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.755553e+05 1 256
+1.752340e+05 2 256
+1.807067e+05 4 256
+### CPU: scaling test 32
+1.569051e+05 1 32
+1.602524e+05 2 32
+1.672982e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.888491e+05 1 256
+3.042597e+05 2 256
+3.124640e+05 4 256
+### CPU: scaling test 32
+3.269877e+05 1 32
+3.302374e+05 2 32
+3.240244e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.938152e+05 1 256
+3.510081e+05 2 256
+3.508687e+05 4 256
+### CPU: scaling test 32
+2.931611e+05 1 32
+3.253190e+05 2 32
+3.534994e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.357370e+05 1 256
+2.318594e+05 2 256
+2.369869e+05 4 256
+### CPU: scaling test 32
+2.357587e+05 1 32
+2.340439e+05 2 32
+2.340131e+05 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index cb0b82e9a4..b2e458fe3f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:18
+DATE: 2025-09-24_07:57:38
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.883484e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.876597e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.990293e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.375747e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.851582e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.885501e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.458412 sec
-INFO: No Floating Point Exceptions have been reported
- 1,935,066,146 cycles # 2.866 GHz
- 2,699,989,812 instructions # 1.40 insn per cycle
- 0.733387527 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.492978 sec
+ 2,090,896,556 cycles # 2.821 GHz
+ 2,929,508,251 instructions # 1.40 insn per cycle
+ 0.798572309 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.013974e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.497451e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.709351e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.541801 sec
-INFO: No Floating Point Exceptions have been reported
- 2,287,504,645 cycles # 2.883 GHz
- 3,220,826,671 instructions # 1.41 insn per cycle
- 0.850636557 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482577104625
Relative difference = 5.209967070245855e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.050634e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.073472e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073472e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.006109e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027526e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027526e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.580404 sec
-INFO: No Floating Point Exceptions have been reported
- 4,643,189,098 cycles # 2.932 GHz
- 13,180,741,468 instructions # 2.84 insn per cycle
- 1.584505840 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.649213 sec
+ 4,749,858,129 cycles # 2.875 GHz
+ 13,378,773,035 instructions # 2.82 insn per cycle
+ 1.653102536 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482734618697
Relative difference = 5.099411406595165e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.871761e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.941517e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.941517e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.760665e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.827247e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.827247e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.895197 sec
-INFO: No Floating Point Exceptions have been reported
- 2,647,990,030 cycles # 2.947 GHz
- 7,474,565,418 instructions # 2.82 insn per cycle
- 0.899253220 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3152) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.950509 sec
+ 2,735,086,948 cycles # 2.867 GHz
+ 7,712,753,081 instructions # 2.82 insn per cycle
+ 0.954580383 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3103) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
+Avg ME (F77/C++) = 0.14247482733329694
+Relative difference = 5.100316128927506e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.201825e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.415489e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.415489e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.009534e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.203444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.203444e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.532048 sec
-INFO: No Floating Point Exceptions have been reported
- 1,472,019,476 cycles # 2.748 GHz
- 3,129,064,583 instructions # 2.13 insn per cycle
- 0.536341858 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3133) (512y: 0) (512z: 0)
+TOTAL : 0.564962 sec
+ 1,496,426,483 cycles # 2.634 GHz
+ 3,194,660,103 instructions # 2.13 insn per cycle
+ 0.568933388 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
+Avg ME (F77/C++) = 0.14247482641080925
+Relative difference = 5.165063512315125e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.569463e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.831852e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.831852e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.240557e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.464165e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.464165e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.479328 sec
-INFO: No Floating Point Exceptions have been reported
- 1,320,483,901 cycles # 2.736 GHz
- 2,983,197,107 instructions # 2.26 insn per cycle
- 0.483280271 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 110) (512z: 0)
+TOTAL : 0.525679 sec
+ 1,411,813,396 cycles # 2.668 GHz
+ 3,068,209,697 instructions # 2.17 insn per cycle
+ 0.529681189 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2857) (512y: 57) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.244340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353724e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353724e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745303 sec -INFO: No Floating Point Exceptions have been reported - 1,365,795,021 cycles # 1.824 GHz - 1,991,870,632 instructions # 1.46 insn per cycle - 0.749335143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 108) (512z: 2251) +TOTAL : 0.752766 sec + 1,326,592,697 cycles # 1.755 GHz + 1,920,429,812 instructions # 1.45 insn per cycle + 0.756964252 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1504) (512y: 61) (512z: 2443) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 222758fe32..cbf9654863 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:32 +DATE: 2025-09-24_07:58:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.879429e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902111e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.398861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.873691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.908270e+06 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462530 sec -INFO: No Floating Point Exceptions have been reported - 1,930,179,746 cycles # 2.847 GHz - 2,724,788,037 instructions # 1.41 insn per cycle - 0.736957830 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494658 sec + 2,092,705,523 cycles # 2.813 GHz + 2,940,970,011 instructions # 1.41 insn per cycle + 0.801023593 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.958663e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.373563e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.576283e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.543078 sec -INFO: No Floating Point Exceptions have been reported - 2,226,045,922 cycles # 2.831 GHz - 3,151,460,121 instructions # 1.42 insn per cycle - 0.843097781 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 80 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482577104625 Relative difference = 5.209967070245855e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.049471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.997666e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.021105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.581795 sec -INFO: No Floating Point Exceptions have been reported - 4,647,850,638 cycles # 2.932 GHz - 13,168,659,581 instructions # 2.83 insn per cycle - 1.585735048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 666) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.659397 sec + 4,758,579,494 cycles # 2.862 GHz + 13,376,500,790 instructions # 2.81 insn per cycle + 1.663454307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 746) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.863863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.766458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.832690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.832690e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.898950 sec -INFO: No Floating Point Exceptions have been reported - 2,647,565,316 cycles # 2.935 GHz - 7,477,127,209 instructions # 2.82 insn per cycle - 0.902852166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.946987 sec + 2,726,321,707 cycles # 2.869 GHz + 7,710,654,552 instructions # 2.83 insn per cycle + 0.951101922 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3098) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.193877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.260572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533092 sec -INFO: No Floating Point Exceptions have been reported - 1,474,101,191 cycles # 2.747 GHz - 3,129,731,788 instructions # 2.12 insn per cycle - 0.537323582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3111) (512y: 0) (512z: 0) +TOTAL : 0.555271 sec + 1,496,591,734 cycles # 2.679 GHz + 3,193,488,754 instructions # 2.13 insn per cycle + 0.559342364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3027) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.595782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.243284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466918e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466918e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.475643 sec -INFO: No Floating Point Exceptions have been reported - 1,319,166,719 cycles # 2.754 GHz - 2,983,572,989 instructions # 2.26 insn per cycle - 0.479589426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 110) (512z: 0) +TOTAL : 0.525318 sec + 1,415,387,088 cycles # 2.677 GHz + 3,066,968,624 instructions # 2.17 insn per cycle + 0.529321884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2841) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.265955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.256974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745301 sec -INFO: No Floating Point Exceptions have been reported - 1,365,993,831 cycles # 1.825 GHz - 1,991,757,917 instructions # 1.46 insn per cycle - 0.749395729 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 108) (512z: 2251) +TOTAL : 0.748391 sec + 1,323,661,803 cycles # 1.761 GHz + 1,919,358,353 instructions # 1.45 insn per cycle + 0.752535445 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1488) (512y: 61) (512z: 2443) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
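The throughput figures that change throughout this diff all come from lines of the form "EvtsPerSec[MatrixElems] (3) = ( ... ) sec^-1". A small sketch for extracting them from a tput log, handy when comparing the old (GPU2023) and new (GPU2025) numbers side by side; the parser below is hypothetical and not part of the repository:

import re

THROUGHPUT = re.compile(r"EvtsPerSec\[MatrixElems\]\s*\(3\)\s*=\s*\(\s*([0-9.eE+-]+)\s*\)")

def matrix_elem_rates(log_text: str) -> list[float]:
    # Return every MatrixElems throughput (events/sec) found in the log,
    # in the order the backends are exercised (cuda, none, sse4, avx2, 512y, 512z).
    return [float(m.group(1)) for m in THROUGHPUT.finditer(log_text)]

Running it over the removed and added sides of a hunk yields matched lists whose ratios quantify the per-backend slowdown or speedup.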
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 9b3f75797b..1bf17797d0 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:13 +DATE: 2025-09-24_09:36:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147069e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.778623e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.394888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.507440e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.405568e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.464017e+06 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535117 sec -INFO: No Floating Point Exceptions have been reported - 2,222,375,781 cycles # 2.890 GHz - 3,181,150,200 instructions # 1.43 insn per cycle - 0.828824866 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.640440 sec + 2,809,701,143 cycles # 2.827 GHz + 4,494,636,207 instructions # 1.60 insn per cycle + 1.053147110 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 84 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 94 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.628496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.666122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.666122e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.576789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.612176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.612176e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.554723 sec -INFO: No Floating Point Exceptions have been reported - 19,293,957,259 cycles # 2.941 GHz - 51,936,518,995 instructions # 2.69 insn per cycle - 6.561734499 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.761250 sec + 19,468,040,698 cycles # 2.878 GHz + 52,866,873,079 instructions # 2.72 insn per cycle + 6.766705663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.914767e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.667809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.780884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780884e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.713846 sec -INFO: No Floating Point Exceptions have been reported - 10,942,394,234 cycles # 2.942 GHz - 30,809,451,561 instructions # 2.82 insn per cycle - 3.720459537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2915) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.041971 sec + 11,631,270,997 cycles # 2.874 GHz + 32,470,600,741 instructions # 2.79 insn per cycle + 4.047477918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701521e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.446930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.756366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.756366e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.349075 sec -INFO: No Floating Point Exceptions have been reported - 6,518,044,155 cycles # 2.767 GHz - 13,691,830,614 instructions # 2.10 insn per cycle - 2.356266703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2941) (512y: 0) (512z: 0) +TOTAL : 2.469249 sec + 6,658,104,030 cycles # 2.692 GHz + 13,758,243,349 instructions # 2.07 insn per cycle + 2.474625313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3017) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.169544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.582169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.710312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.054677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.054677e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.151067 sec -INFO: No Floating Point Exceptions have been reported - 5,973,431,908 cycles # 2.768 GHz - 13,032,735,919 instructions # 2.18 insn per cycle - 2.158817844 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2667) (512y: 146) (512z: 0) +TOTAL : 2.336944 sec + 6,266,032,072 cycles # 2.676 GHz + 13,326,118,877 instructions # 2.13 insn per cycle + 2.342347316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2845) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.442417e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.171604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321823e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.166223 sec -INFO: No Floating Point Exceptions have been reported - 5,879,580,303 cycles # 1.853 GHz - 8,614,888,302 instructions # 1.47 insn per cycle - 3.173636028 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1506) (512y: 128) (512z: 1946) +TOTAL : 3.416630 sec + 6,077,886,706 cycles # 1.777 GHz + 8,372,353,704 instructions # 1.38 insn per cycle + 3.422247419 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1562) (512y: 71) (512z: 2101) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
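The "=Symbols in CPPProcess_cpp.o=" tallies track how many SSE4/AVX2/AVX-512 instructions the compiler emitted for each build, which is why they shift when the code generation changes. A rough, hypothetical way to reproduce a count of that kind (the repository's actual tooling may differ) is to scan a disassembly for a register class, e.g. zmm registers for the 512z column:

import re
import subprocess

def count_zmm_instructions(objfile: str) -> int:
    # Heuristic: count disassembled instructions that touch an AVX-512 zmm register.
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    return sum(1 for line in asm.splitlines() if re.search(r"%zmm\d+", line))

This is only an approximation of the per-ISA instruction counts printed above.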
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index fe94934cb0..274cf5b9f8 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:39 +DATE: 2025-09-24_09:36:59 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.155696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.811430e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416776e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.495635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.395599e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.454803e+06 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.531667 sec -INFO: No Floating Point Exceptions have been reported - 2,222,115,079 cycles # 2.893 GHz - 3,196,008,298 instructions # 1.44 insn per cycle - 0.825144177 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.641751 sec + 2,814,921,189 cycles # 2.828 GHz + 4,486,858,306 instructions # 1.59 insn per cycle + 1.054520115 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 85 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 94 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.710634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.649275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.688267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.688267e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.246482 sec -INFO: No Floating Point Exceptions have been reported - 18,390,828,933 cycles # 2.942 GHz - 50,070,723,541 instructions # 2.72 insn per cycle - 6.253313848 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.468573 sec + 18,605,591,168 cycles # 2.874 GHz + 50,910,754,622 instructions # 2.74 insn per cycle + 6.474203070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
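The EvtsPerSec figures can be cross-checked against the launch parameters. Assuming the usual meaning of -p in these logs (blocks, threads per block, iterations), a run like "-p 2048 256 2" processes 2048*256*2 events, and dividing by the TOTAL wall time lands in the same ballpark as the reported rates; TOTAL also includes setup, so the naive estimate comes out slightly lower. A back-of-the-envelope in Python:

    # Cross-check of the reported throughput, assuming the -p arguments
    # mean "blocks threads iterations"; numbers are copied from the
    # none_d_inl0_hrd1 run above.
    blocks, threads, iterations = 2048, 256, 2
    events = blocks * threads * iterations   # 1048576 events in total
    total_seconds = 6.468573                 # TOTAL from the log above
    print(f"{events / total_seconds:.3e} events/s")  # ~1.62e+05 vs 1.65e+05 reported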
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.214398e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706155e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.823800e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.535906 sec -INFO: No Floating Point Exceptions have been reported - 10,415,008,507 cycles # 2.940 GHz - 29,198,189,749 instructions # 2.80 insn per cycle - 3.543300262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2733) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.985573 sec + 11,479,073,541 cycles # 2.877 GHz + 31,953,213,577 instructions # 2.78 insn per cycle + 3.991237832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.327920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.613203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.613203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.472224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.788973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.788973e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.541514 sec -INFO: No Floating Point Exceptions have been reported - 7,032,477,509 cycles # 2.760 GHz - 15,175,173,386 instructions # 2.16 insn per cycle - 2.548867076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3020) (512y: 0) (512z: 0) +TOTAL : 2.455792 sec + 6,602,535,558 cycles # 2.684 GHz + 13,513,270,122 instructions # 2.05 insn per cycle + 2.461238995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2933) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.529226e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.840126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.840126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.769500e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.125335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.125335e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.433970 sec -INFO: No Floating Point Exceptions have been reported - 6,732,593,285 cycles # 2.759 GHz - 14,647,151,783 instructions # 2.18 insn per cycle - 2.441354685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2621) (512y: 302) (512z: 0) +TOTAL : 2.308652 sec + 6,213,743,119 cycles # 2.688 GHz + 13,125,333,077 instructions # 2.11 insn per cycle + 2.314078971 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2770) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
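The "=Symbols in CPPProcess_cpp.o=" counters above tally how many SSE4, AVX2 and AVX512 instructions the compiler emitted into the object file, which is how these logs verify that each BACKEND really produced the intended SIMD code. A rough Python sketch of how such counts can be produced by disassembling the object and bucketing instructions by register class; the patterns are simplified stand-ins, not the repository's actual instruction lists:

    # Rough sketch: disassemble the object file and count vector
    # instructions by register class. The regexes are simplified
    # stand-ins, not the actual lists behind the "=Symbols=" lines.
    import re
    import subprocess

    asm = subprocess.run(["objdump", "-d", "CPPProcess_cpp.o"],
                         capture_output=True, text=True).stdout
    counts = {
        "~sse4": len(re.findall(r"%xmm", asm)),  # 128-bit xmm registers
        "avx2":  len(re.findall(r"%ymm", asm)),  # 256-bit ymm registers
        "512z":  len(re.findall(r"%zmm", asm)),  # 512-bit zmm registers
    }
    print(" ".join(f"({k}: {v})" for k, v in counts.items()))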
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.326729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.175393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326517e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326517e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.271504 sec -INFO: No Floating Point Exceptions have been reported - 6,070,928,941 cycles # 1.852 GHz - 10,360,391,243 instructions # 1.71 insn per cycle - 3.278977914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 214) (512z: 2129) +TOTAL : 3.412394 sec + 6,060,342,842 cycles # 1.774 GHz + 8,345,690,212 instructions # 1.38 insn per cycle + 3.418017007 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 71) (512z: 2096) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
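The double-precision (FPTYPE=d) builds above agree with Fortran at the ~2e-7 level, while the single-precision (FPTYPE=f) logs that follow agree only at the ~4e-5 level. This is the expected precision hierarchy: a float carries roughly 7 significant decimal digits against a double's ~16, so float results cannot agree beyond a few times 1e-7 even in the best case, and accumulation over many terms pushes the observed difference to the ~1e-5 scale. The machine epsilons make the point:

    # Single vs double precision: the C++/F77 agreement seen in these
    # logs (~4e-5 for FLOAT, ~2e-7 for DOUBLE) tracks the significand.
    import numpy as np
    print(np.finfo(np.float32).eps)  # ~1.19e-07 (about 7 decimal digits)
    print(np.finfo(np.float64).eps)  # ~2.22e-16 (about 16 decimal digits)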
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 8cd2c74f38..5df5f2e017 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,242 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:02 +DATE: 2025-09-24_09:38:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.465620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510965e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608079e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672770e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850350e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862111e+07 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.492110 sec -INFO: No Floating Point Exceptions have been reported - 2,084,727,455 cycles # 2.877 GHz - 2,955,736,176 instructions # 1.42 insn per cycle - 0.784112386 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.546796 sec + 2,347,767,839 cycles # 2.827 GHz + 3,518,985,878 instructions # 1.50 insn per cycle + 0.889028718 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 60 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136696910951287 +Relative difference = 4.165793710634106e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.686557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.727704e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.603617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.641812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641812e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.305463 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,635,302,225 cycles # 2.953 GHz - 51,219,407,083 instructions # 2.75 insn per cycle - 6.310992251 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.628001 sec + 19,095,296,333 cycles # 2.879 GHz + 52,570,684,440 instructions # 2.75 insn per cycle + 6.633186048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.043062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.307407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.307407e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.650976e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.873298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.873298e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.681205 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,932,523,130 cycles # 2.953 GHz - 19,317,767,787 instructions # 2.44 insn per cycle - 2.686665617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.961111 sec + 8,533,073,654 cycles # 2.878 GHz + 20,129,758,419 instructions # 2.36 insn per cycle + 2.966298075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3870) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.901471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.926003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.926003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.018131e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.075080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.075080e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.413719 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,953,020,388 cycles # 2.786 GHz - 8,832,668,299 instructions # 2.23 insn per cycle - 1.419629254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3715) (512y: 0) (512z: 0) +TOTAL : 1.395955 sec + 3,785,475,901 cycles # 2.703 GHz + 8,259,140,872 instructions # 2.18 insn per cycle + 1.401234976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3564) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.392997e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.544307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.544307e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.448216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.626503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.626503e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.337803 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,735,491,375 cycles # 2.782 GHz - 8,430,906,889 instructions # 2.26 insn per cycle - 1.343508069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3541) (512y: 20) (512z: 0) +TOTAL : 1.329067 sec + 3,609,245,916 cycles # 2.707 GHz + 8,052,417,503 instructions # 2.23 insn per cycle + 1.334245097 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3450) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.024352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.578236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.578236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.458703e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.125669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.125669e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.827995 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,508,723,607 cycles # 1.915 GHz - 6,244,798,669 instructions # 1.78 insn per cycle - 1.833521857 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2325) (512y: 22) (512z: 2290) +TOTAL : 1.712295 sec + 3,171,872,574 cycles # 1.848 GHz + 5,491,453,610 instructions # 1.73 insn per cycle + 1.717258512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2147) (512y: 5) (512z: 2290) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 1ff1d26090..ef6aa4541d 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:24 +DATE: 2025-09-24_09:39:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.690902e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.615208e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727767e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671134e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.848561e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860377e+07 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493976 sec -INFO: No Floating Point Exceptions have been reported - 2,066,790,877 cycles # 2.843 GHz - 2,969,404,210 instructions # 1.44 insn per cycle - 0.785535997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.546514 sec + 2,342,088,165 cycles # 2.823 GHz + 3,542,032,932 instructions # 1.51 insn per cycle + 0.888457994 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 60 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
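The new ncu profiles in these logs split the old single sigmaKin kernel into per-diagram kernels ("diagram1", "diagram2") plus a separate "color_sum_kernel", each with far fewer registers per thread. In MadGraph-style codes the colour sum is the quadratic form |M|^2 = J^H C J of the colour amplitudes (jamps) with the constant colour matrix. A toy numpy illustration of that reduction; the matrix and amplitudes below are placeholders, not the actual values for this HEFT gg > bb~ process:

    # Toy illustration of a colour sum, |M|^2 = J^H C J: a quadratic form
    # of the colour amplitudes with the constant, symmetric colour matrix.
    # Placeholder values, not those of gg > bb~ in HEFT.
    import numpy as np

    C = np.array([[16.0, -2.0],
                  [-2.0, 16.0]]) / 3.0        # placeholder colour matrix
    J = np.array([1.0 + 0.5j, -0.25 + 1.0j])  # placeholder colour amplitudes
    me2 = float(np.real(np.conj(J) @ C @ J))  # real by hermiticity of the form
    print(me2)

Factoring this step out into its own kernel is what makes the BLAS-based colour sum (the HASBLAS=hasBlas and CUDACPP_RUNTIME_BLASCOLORSUM knobs in the headers above) possible at runtime.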
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 4.313490e+00
-Avg ME (F77/GPU) = 4.3136695491848513
-Relative difference = 4.162503792787837e-05
+Avg ME (F77/GPU) = 4.3136696910951287
+Relative difference = 4.165793710634106e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.736131e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.779781e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.779781e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.641763e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.681776e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.681776e+05 ) sec^-1
MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0
-TOTAL : 6.127979 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 18,032,140,147 cycles # 2.940 GHz
- 49,602,643,371 instructions # 2.75 insn per cycle
- 6.133935412 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.475013 sec
+ 18,644,002,819 cycles # 2.878 GHz
+ 51,007,439,568 instructions # 2.74 insn per cycle
+ 6.480509328 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313574e+00
Avg ME (F77/C++) = 4.3135738277342170
Relative difference = 3.9935743068669333e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.506367e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.839198e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.839198e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.684584e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.908730e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.908730e+05 ) sec^-1
MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0
-TOTAL : 2.414203 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 7,115,995,603 cycles # 2.942 GHz
- 18,533,869,751 instructions # 2.60 insn per cycle
- 2.419892180 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.934573 sec
+ 8,454,211,342 cycles # 2.877 GHz
+ 19,835,014,469 instructions # 2.35 insn per cycle
+ 2.939713714 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3811) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313572e+00
Avg ME (F77/C++) = 4.3135722697479650
Relative difference = 6.253470796314402e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.374488e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.825683e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.825683e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.065369e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.131864e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.131864e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.037733 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 5,644,200,229 cycles # 2.763 GHz
- 10,848,148,808 instructions # 1.92 insn per cycle
- 2.043741542 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4274) (512y: 0) (512z: 0)
+TOTAL : 1.387082 sec
+ 3,764,013,420 cycles # 2.705 GHz
+ 8,137,849,027 instructions # 2.16 insn per cycle
+ 1.392187853 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3474) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313565e+00
-Avg ME (F77/C++) = 4.3135645242873579
-Relative difference = 1.1028294269894893e-07
+Avg ME (F77/C++) = 4.3135645270813257
+Relative difference = 1.096352260831459e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.433283e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.894901e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.894901e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.480903e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.668132e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.668132e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.017462 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 5,594,464,289 cycles # 2.767 GHz
- 10,554,918,385 instructions # 1.89 insn per cycle
- 2.022782231 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4138) (512y: 12) (512z: 0)
+TOTAL : 1.323763 sec
+ 3,586,004,374 cycles # 2.700 GHz
+ 7,955,445,372 instructions # 2.22 insn per cycle
+ 1.328913707 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3370) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313565e+00
-Avg ME (F77/C++) = 4.3135645242873579
-Relative difference = 1.1028294269894893e-07
+Avg ME (F77/C++) = 4.3135645270813257
+Relative difference = 1.096352260831459e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.364066e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.648223e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.648223e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.488913e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.159953e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.159953e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.491143 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 4,639,687,839 cycles # 1.859 GHz
- 8,661,216,579 instructions # 1.87 insn per cycle
- 2.496647539 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 0) (512z: 2885)
+TOTAL : 1.704713 sec
+ 3,166,186,323 cycles # 1.853 GHz
+ 5,481,656,768 instructions # 1.73 insn per cycle
+ 1.709836890 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 5) (512z: 2290)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313564e+00
-Avg ME (F77/C++) = 4.3135643536224961
-Relative difference = 8.197919301304478e-08
+Avg ME (F77/C++) = 4.3135642320849001
+Relative difference = 5.380351369373482e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index 12c9da87af..eeeb59d618 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,246 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-DATE: 2024-10-06_10:02:06
+DATE: 2025-09-24_09:37:33
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.131914e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.755854e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.359452e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.517528e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.419175e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.478423e+06 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 0.535565 sec
-INFO: No Floating Point Exceptions have been reported
- 2,204,224,001 cycles # 2.864 GHz
- 3,121,247,303 instructions # 1.42 insn per cycle
- 0.828499405 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 228
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.634900 sec
+ 2,788,760,911 cycles # 2.835 GHz
+ 4,444,778,417 instructions # 1.59 insn per cycle
+ 1.042872548 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 84
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 4.313472e+00
-Avg ME (F77/GPU) = 4.3134711012809239
-Relative difference = 2.0835166567625394e-07
+Avg ME (F77/GPU) = 4.3134712562812831
+Relative difference = 1.7241765260874332e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.529079e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561968e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561968e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.478269e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.509767e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.509767e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 6.973239 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 20,550,139,482 cycles # 2.945 GHz
- 51,941,635,065 instructions # 2.53 insn per cycle
- 6.980082779 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.204058 sec
+ 20,755,829,028 cycles # 2.880 GHz
+ 52,917,427,628 instructions # 2.55 insn per cycle
+ 7.209554620 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711782756741
+Relative difference = 1.9050183377028104e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.672019e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.782339e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782339e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.556152e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.660192e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.660192e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 4.043433 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 11,521,778,322 cycles # 2.845 GHz
- 30,615,090,868 instructions # 2.66 insn per cycle
- 4.050715703 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2972) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.213212 sec
+ 12,129,215,180 cycles # 2.876 GHz
+ 32,047,748,942 instructions # 2.64 insn per cycle
+ 4.218582177 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3309) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711778081822
+Relative difference = 1.9061021324348284e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.474164e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.781347e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.348524e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.643676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.643676e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.469295 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,758,530,167 cycles # 2.729 GHz
- 13,653,357,404 instructions # 2.02 insn per cycle
- 2.477625143 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3118) (512y: 0) (512z: 0)
+TOTAL : 2.522477 sec
+ 6,765,170,166 cycles # 2.677 GHz
+ 13,545,443,870 instructions # 2.00 insn per cycle
+ 2.527903273 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3162) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.946193e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.312777e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.312777e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.635693e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.969889e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.969889e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.239110 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,206,585,291 cycles # 2.765 GHz
- 13,005,835,459 instructions # 2.10 insn per cycle
- 2.246664710 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2851) (512y: 150) (512z: 0)
+TOTAL : 2.372202 sec
+ 6,403,479,611 cycles # 2.694 GHz
+ 13,110,521,732 instructions # 2.05 insn per cycle
+ 2.377605412 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2989) (512y: 53) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.130780e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.276017e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.276017e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.140442e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.288410e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.288410e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.470623 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,429,525,372 cycles # 1.849 GHz
- 8,729,822,669 instructions # 1.36 insn per cycle
- 3.478318009 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1792) (512y: 130) (512z: 2014)
+TOTAL : 3.450244 sec
+ 6,142,491,223 cycles # 1.779 GHz
+ 8,289,889,012 instructions # 1.35 insn per cycle
+ 3.455551171 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1581) (512y: 61) (512z: 2145)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
index 90c964242c..2222bb6c1d 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,246 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-DATE: 2024-10-06_10:02:34
+DATE: 2025-09-24_09:38:11
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.143359e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.817002e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.430401e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.474103e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.371663e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.430529e+06 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 0.533031 sec
-INFO: No Floating Point Exceptions have been reported
- 2,222,154,822 cycles # 2.885 GHz
- 3,215,427,054 instructions # 1.45 insn per cycle
- 0.826924367 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 216
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.639804 sec
+ 2,818,874,599 cycles # 2.836 GHz
+ 4,491,683,880 instructions # 1.59 insn per cycle
+ 1.052963598 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 85
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712562812831 +Relative difference = 1.7241765260874332e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.616471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.577982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.577982e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.603326 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 19,494,406,226 cycles # 2.950 GHz - 49,966,413,800 instructions # 2.56 insn per cycle - 6.609959024 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 599) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.903615 sec + 19,875,040,203 cycles # 2.878 GHz + 50,962,768,917 instructions # 2.56 insn per cycle + 6.909360746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711782756741
+Relative difference = 1.9050183377028104e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.890177e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.018164e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018164e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.566397e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.670719e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.670719e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.745798 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 11,068,643,232 cycles # 2.950 GHz
- 29,164,471,893 instructions # 2.63 insn per cycle
- 3.753005329 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2815) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.197287 sec
+ 11,967,434,460 cycles # 2.849 GHz
+ 31,529,543,072 instructions # 2.63 insn per cycle
+ 4.202854098 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3263) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711778081822
+Relative difference = 1.9061021324348284e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.744994e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.955254e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.955254e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.400358e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.703188e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.703188e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.917714 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 8,087,123,435 cycles # 2.766 GHz
- 15,210,355,188 instructions # 1.88 insn per cycle
- 2.924634632 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3203) (512y: 0) (512z: 0)
+TOTAL : 2.493209 sec
+ 6,717,361,260 cycles # 2.689 GHz
+ 13,300,799,614 instructions # 1.98 insn per cycle
+ 2.498736225 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3078) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.909194e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.140218e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.140218e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.702572e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.045593e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.045593e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.798673 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 7,730,347,780 cycles # 2.756 GHz
- 14,498,978,915 instructions # 1.88 insn per cycle
- 2.805768338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 304) (512z: 0)
+TOTAL : 2.340637 sec
+ 6,332,876,477 cycles # 2.700 GHz
+ 12,910,869,466 instructions # 2.04 insn per cycle
+ 2.346037040 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2914) (512y: 53) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.049249e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.186111e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.186111e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.136297e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.283420e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.283420e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.561293 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,578,699,260 cycles # 1.844 GHz
- 9,927,155,424 instructions # 1.51 insn per cycle
- 3.569129809 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1565) (512y: 216) (512z: 2216)
+TOTAL : 3.453468 sec
+ 6,141,795,664 cycles # 1.776 GHz
+ 8,267,854,036 instructions # 1.35 insn per cycle
+ 3.458910469 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1536) (512y: 61) (512z: 2140)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index 2b34ea67ad..9e7ebf842e 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-DATE: 2024-10-06_10:00:07
+DATE: 2025-09-24_09:33:21
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.760509e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.779507e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782702e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.189977e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.210295e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.213498e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.473450 sec
-INFO: No Floating Point Exceptions have been reported
- 1,994,326,240 cycles # 2.874 GHz
- 2,845,102,706 instructions # 1.43 insn per cycle
- 0.753810347 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.535283 sec
+ 2,207,411,095 cycles # 2.818 GHz
+ 3,160,583,611 instructions # 1.43 insn per cycle
+ 0.840627614 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.019067e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.126130e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.133988e+05 ) sec^-1
-MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4
-TOTAL : 0.490494 sec
-INFO: No Floating Point Exceptions have been reported
- 2,031,600,016 cycles # 2.857 GHz
- 2,995,319,726 instructions # 1.47 insn per cycle
- 0.772627668 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 76
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 8.127459e-06
 Avg ME (F77/GPU) = 8.1274562860176604E-006
 Relative difference = 3.3392753366481633e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 3.383469e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.386752e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.386752e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.307555e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.310855e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.310855e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.158264 sec
-INFO: No Floating Point Exceptions have been reported
- 469,342,334 cycles # 2.906 GHz
- 1,390,298,076 instructions # 2.96 insn per cycle
- 0.162106230 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3908) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.161929 sec
+ 471,695,181 cycles # 2.858 GHz
+ 1,386,120,911 instructions # 2.94 insn per cycle
+ 0.165662610 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4201) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860167185E-006
 Relative difference = 3.339276495559746e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.476358e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.488167e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.488167e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.839650e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.849559e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.849559e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.083612 sec
-INFO: No Floating Point Exceptions have been reported
- 240,584,825 cycles # 2.769 GHz
- 693,113,903 instructions # 2.88 insn per cycle
- 0.087424946 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9482) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.092454 sec
+ 255,607,375 cycles # 2.672 GHz
+ 718,366,684 instructions # 2.81 insn per cycle
+ 0.096275754 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12596) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860167168E-006
 Relative difference = 3.3392764976441195e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.432068e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.438681e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.438681e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.315918e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.320963e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.320963e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.038906 sec
-INFO: No Floating Point Exceptions have been reported
- 114,140,366 cycles # 2.711 GHz
- 257,891,266 instructions # 2.26 insn per cycle
- 0.042661267 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8501) (512y: 0) (512z: 0)
+TOTAL : 0.042212 sec
+ 119,980,053 cycles # 2.646 GHz
+ 267,131,841 instructions # 2.23 insn per cycle
+ 0.046071538 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8995) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.618386e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.625883e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.625883e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.473800e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.480265e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.480265e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.034671 sec
-INFO: No Floating Point Exceptions have been reported
- 102,555,024 cycles # 2.705 GHz
- 240,017,026 instructions # 2.34 insn per cycle
- 0.038425016 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8143) (512y: 150) (512z: 0)
+TOTAL : 0.037950 sec
+ 108,004,494 cycles # 2.627 GHz
+ 252,474,875 instructions # 2.34 insn per cycle
+ 0.041746760 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8827) (512y: 45) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.192893e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.198052e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.198052e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.155565e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.160261e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.160261e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.046494 sec
-INFO: No Floating Point Exceptions have been reported
- 90,048,800 cycles # 1.806 GHz
- 134,302,710 instructions # 1.49 insn per cycle
- 0.050438224 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1943) (512y: 126) (512z: 7086)
+TOTAL : 0.047943 sec
+ 90,674,116 cycles # 1.777 GHz
+ 139,511,089 instructions # 1.54 insn per cycle
+ 0.051731540 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1725) (512y: 61) (512z: 8276)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
index dc41fe503f..39c980daa5 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-DATE: 2024-10-06_10:00:18
+DATE: 2025-09-24_09:33:51
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.797107e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.816023e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819423e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.219837e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.236573e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.242139e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.469966 sec
-INFO: No Floating Point Exceptions have been reported
- 2,001,057,465 cycles # 2.881 GHz
- 2,930,552,926 instructions # 1.46 insn per cycle
- 0.752195966 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.535604 sec
+ 2,215,769,027 cycles # 2.824 GHz
+ 3,174,955,614 instructions # 1.43 insn per cycle
+ 0.841816639 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.121137e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.233030e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.241027e+05 ) sec^-1
-MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4
-TOTAL : 0.489610 sec
-INFO: No Floating Point Exceptions have been reported
- 2,050,200,483 cycles # 2.873 GHz
- 3,056,241,818 instructions # 1.49 insn per cycle
- 0.771808178 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 108
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 76
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 8.127459e-06
 Avg ME (F77/GPU) = 8.1274562860176604E-006
 Relative difference = 3.3392753366481633e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 3.406266e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.409565e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.409565e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.293505e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.296702e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296702e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.156466 sec
-INFO: No Floating Point Exceptions have been reported
- 465,689,745 cycles # 2.917 GHz
- 1,385,079,930 instructions # 2.97 insn per cycle
- 0.160315659 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3796) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.161859 sec
+ 469,852,209 cycles # 2.853 GHz
+ 1,378,582,277 instructions # 2.93 insn per cycle
+ 0.165480722 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4321) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.388983e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.401822e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401822e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.784302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.794144e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.794144e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083799 sec -INFO: No Floating Point Exceptions have been reported - 238,961,924 cycles # 2.745 GHz - 689,073,758 instructions # 2.88 insn per cycle - 0.087593094 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9525) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.092403 sec + 253,996,770 cycles # 2.658 GHz + 712,985,517 instructions # 2.81 insn per cycle + 0.096121941 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12756) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419818e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425419e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425419e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.317943e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323081e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323081e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038479 sec -INFO: No Floating Point Exceptions have been reported - 111,800,811 cycles # 2.682 GHz - 253,484,287 instructions # 2.27 insn per cycle - 0.042138594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8457) (512y: 0) (512z: 0) +TOTAL : 0.041365 sec + 117,280,788 cycles # 2.633 GHz + 262,313,711 instructions # 2.24 insn per cycle + 0.045157630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8941) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.620452e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.628839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628839e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.466443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.472654e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.472654e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033872 sec -INFO: No Floating Point Exceptions have been reported - 100,998,379 cycles # 2.706 GHz - 235,641,730 instructions # 2.33 insn per cycle - 0.037957581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8101) (512y: 150) (512z: 0) +TOTAL : 0.037331 sec + 106,205,166 cycles # 2.626 GHz + 247,558,941 instructions # 2.33 insn per cycle + 0.041028602 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8771) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.156678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147394e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047111 sec -INFO: No Floating Point Exceptions have been reported - 88,066,978 cycles # 1.743 GHz - 129,735,533 instructions # 1.47 insn per cycle - 0.051105123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1899) (512y: 126) (512z: 7084) +TOTAL : 0.047796 sec + 88,613,439 cycles # 1.738 GHz + 134,698,543 instructions # 1.52 insn per cycle + 0.051634230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 61) (512z: 8276) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 4b10dcf1d1..35461d8677 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:52 +DATE: 2025-09-24_09:35:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.214342e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.224285e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.226222e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.476842 sec -INFO: No Floating Point Exceptions have been reported - 1,989,613,876 cycles # 2.873 GHz - 2,928,089,356 instructions # 1.47 insn per cycle - 0.750924959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.271224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.283601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.286909e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.534205 sec + 2,200,496,463 cycles # 2.812 GHz + 3,139,642,915 instructions # 1.43 insn per cycle + 0.840870842 seconds time elapsed ......................................................................... 
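Note for readers of these logs: where the 2024 runs profiled a single monolithic "sigmaKin" kernel, the 2025 runNcu blocks above and below report three separate kernels (diagram1, diagram2, color_sum_kernel), each with its own register count and branch-uniformity percentage. The following is a minimal sketch of how such per-kernel metrics can be collected, assuming NVIDIA Nsight Compute (ncu) is on PATH; it is not the actual runNcu wrapper, whose source is not part of this diff, and the function name and paths are illustrative.

    import subprocess

    # Illustrative only: collect the two per-kernel metrics quoted in the logs
    # for every kernel launched by the executable, assuming "ncu" is on PATH.
    METRICS = ("launch__registers_per_thread,"
               "sm__sass_average_branch_targets_threads_uniform.pct")

    def run_ncu(exe, args=("-p", "1", "256", "1")):
        cmd = ["ncu", "--metrics", METRICS, exe, *args]
        return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

    # Hypothetical usage:
    # print(run_ncu("./build.cuda_d_inl0_hrd1/check_cuda.exe"))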
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.950242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.029144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036217e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.473909 sec -INFO: No Floating Point Exceptions have been reported - 1,995,145,721 cycles # 2.886 GHz - 2,912,342,089 instructions # 1.46 insn per cycle - 0.748274226 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272869669930272E-006 -Relative difference = 4.548524165778887e-06 +Avg ME (F77/GPU) = 8.1272868580195144E-006 +Relative difference = 4.535115754234733e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.462777e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466245e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466245e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.392974e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396413e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154509 sec -INFO: No Floating Point Exceptions have been reported - 463,950,135 cycles # 2.942 GHz - 1,382,102,782 instructions # 2.98 insn per cycle - 0.158280886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3058) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157853 sec + 458,926,719 cycles # 2.854 GHz + 1,383,056,756 instructions # 3.01 insn per cycle + 0.161530858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3677) (avx2: 0) (512y: 0) (512z: 0) 
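The "=Symbols in CPPProcess_cpp.o=" lines above tally how many SIMD instructions of each flavour (sse4/avx2/512y/512z) end up in the compiled object file. The exact classification script is not shown in this diff; the sketch below uses xmm/ymm/zmm register usage in an objdump disassembly as a crude proxy for those buckets, and the function name and bucketing are illustrative assumptions.

    import re, subprocess

    # Crude proxy, not the actual tput classification: bucket SIMD usage in an
    # object file by which vector register class the disassembly touches.
    def simd_tally(objfile):
        asm = subprocess.run(["objdump", "-d", objfile], capture_output=True,
                             text=True, check=True).stdout
        return {name: len(re.findall(pat, asm))
                for name, pat in (("xmm (~sse4)", r"%xmm\d+"),
                                  ("ymm (~avx2)", r"%ymm\d+"),
                                  ("zmm (~512z)", r"%zmm\d+"))}

    # Hypothetical usage:
    # print(simd_tally("CPPProcess_cpp.o"))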
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278103111816235E-006 +Relative difference = 3.8286035741012934e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226773e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226773e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.135252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139079e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139079e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045148 sec -INFO: No Floating Point Exceptions have been reported - 132,927,826 cycles # 2.743 GHz - 372,156,154 instructions # 2.80 insn per cycle - 0.049041087 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.048547 sec + 136,061,072 cycles # 2.633 GHz + 383,416,652 instructions # 2.82 insn 
per cycle + 0.052339343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13141) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127808e-06 +Avg ME (F77/C++) = 8.1278084747868991E-006 +Relative difference = 5.841512231881029e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.776220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801025e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801025e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565076e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.586954e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586954e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021005 sec -INFO: No Floating Point Exceptions have been reported - 65,153,242 cycles # 2.690 GHz - 142,838,093 instructions # 2.19 insn per cycle - 0.024771930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9241) 
(512y: 0) (512z: 0) +TOTAL : 0.022579 sec + 67,053,332 cycles # 2.610 GHz + 146,052,379 instructions # 2.18 insn per cycle + 0.026232857 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9511) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.070417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.098717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.098717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.767006e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794756e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019184 sec -INFO: No Floating Point Exceptions have been reported - 60,296,621 cycles # 2.678 GHz - 132,772,434 instructions # 2.20 insn per cycle - 
0.023065155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8959) (512y: 28) (512z: 0) +TOTAL : 0.021017 sec + 62,203,202 cycles # 2.571 GHz + 138,374,959 instructions # 2.22 insn per cycle + 0.024742594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.324469e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345673e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345673e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.276511e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298668e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298668e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024875 sec -INFO: No Floating Point Exceptions have been reported 
- 52,411,208 cycles # 1.857 GHz - 79,637,147 instructions # 1.52 insn per cycle - 0.028776798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2836) (512y: 30) (512z: 7437) +TOTAL : 0.025440 sec + 52,353,208 cycles # 1.824 GHz + 81,139,080 instructions # 1.55 insn per cycle + 0.029417169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2361) (512y: 5) (512z: 8238) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 +Avg ME (F77/C++) = 8.1275370319141356E-006 +Relative difference = 3.926667622210349e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 67a7328c67..1d521f46f1 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
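Each cmpExe pair in these logs accepts a run when the relative difference between the C++/CUDA and Fortran average matrix elements is at most 5E-3. The logged figures are consistent with |a - b| / |a|; for example, the 512z_f_inl0_hrd0 comparison just above is reproduced by:

    # Values copied from the 512z_f_inl0_hrd0 cmpExe block above:
    a = 8.127537e-06            # Avg ME (C++/C++)
    b = 8.1275370319141356e-06  # Avg ME (F77/C++)
    rel = abs(a - b) / abs(a)
    print(rel)                  # ~3.9267e-09, matching the logged 3.926667622210349e-09
    assert rel <= 5e-3          # the acceptance threshold quoted in the logs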
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:01:02 +DATE: 2025-09-24_09:35:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.244507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.246621e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.477845 sec -INFO: No Floating Point Exceptions have been reported - 1,997,911,903 cycles # 2.876 GHz - 2,886,764,809 instructions # 1.44 insn per cycle - 0.753229194 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.303236e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.318225e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.321223e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.536056 sec + 2,212,356,220 cycles # 2.822 GHz + 3,186,924,366 instructions # 1.44 insn per cycle + 0.842335533 seconds time elapsed ......................................................................... 
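The counter triplets in these logs (cycles with an effective GHz, instructions with insn-per-cycle, and elapsed seconds) have the shape of Linux 'perf stat' output. Below is an illustrative sketch of gathering the same counters, assuming perf is available; it is not the script that produced these logs, and the function name and paths are assumptions.

    import subprocess

    # Illustrative sketch: gather the cycles/instructions counters for a
    # command; note that perf stat writes its counter summary to stderr.
    def perf_counters(cmd):
        res = subprocess.run(["perf", "stat", "-e", "cycles,instructions", "--", *cmd],
                             capture_output=True, text=True)
        return res.stderr

    # Hypothetical usage:
    # print(perf_counters(["./build.none_f_inl0_hrd1/check_cpp.exe", "-p", "1", "256", "2"]))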
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.193422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.206590e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.477844 sec -INFO: No Floating Point Exceptions have been reported - 2,000,227,335 cycles # 2.879 GHz - 2,887,661,973 instructions # 1.44 insn per cycle - 0.753759254 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272866419447706E-006 -Relative difference = 4.508529302013153e-06 +Avg ME (F77/GPU) = 8.1272866414401287E-006 +Relative difference = 4.508467209435567e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.435869e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439325e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439325e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345209e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348975e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348975e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154994 sec -INFO: No Floating Point Exceptions have been reported - 461,652,768 cycles # 2.918 GHz - 1,376,807,565 instructions # 2.98 insn per cycle - 0.158786297 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2930) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.159052 sec + 459,276,011 cycles # 2.840 GHz + 1,376,864,775 instructions # 3.00 insn per cycle + 0.162966642 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3781) (avx2: 0) (512y: 0) (512z: 0) 
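As a reading aid: the "-p 1 256 2" arguments select (blocks, threads per block, iterations), so each of these runs processes 1 x 256 x 2 = 512 events, matching the "processed 512 events" DEBUG lines. An EvtsPerSec figure is presumably the event count divided by the time spent in the corresponding instrumented section, which covers only part of TOTAL:

    blocks, threads, iterations = 1, 256, 2
    events = blocks * threads * iterations  # 512, as in the DEBUG lines
    total = 0.159052                        # TOTAL (sec) of the none_f_inl0_hrd1 run above
    print(events / total)                   # ~3219 ev/s; the reported MECalcOnly rate
                                            # (3.348975e+03) is higher because its timer
                                            # covers only a subset of TOTAL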
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278103112056928E-006 +Relative difference = 3.82889970907774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.215601e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220158e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220158e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.145776e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.149797e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149797e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.044587 sec -INFO: No Floating Point Exceptions have been reported - 130,364,411 cycles # 2.725 GHz - 367,274,419 instructions # 2.82 insn per cycle - 0.048380365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10124) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.047450 sec + 133,795,452 cycles # 2.643 GHz + 378,404,458 instructions # 2.83 insn per 
cycle + 0.051234729 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127808e-06 +Avg ME (F77/C++) = 8.1278084747868991E-006 +Relative difference = 5.841512231881029e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799777e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.825160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.825160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.486627e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506423e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506423e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020201 sec -INFO: No Floating Point Exceptions have been reported - 63,211,215 cycles # 2.704 GHz - 138,063,768 instructions # 2.18 insn per cycle - 0.023985955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9196) 
(512y: 0) (512z: 0) +TOTAL : 0.022522 sec + 65,118,615 cycles # 2.531 GHz + 141,235,296 instructions # 2.17 insn per cycle + 0.026316984 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9456) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.035669e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.062918e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.062918e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.749111e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.772359e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.772359e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.018625 sec -INFO: No Floating Point Exceptions have been reported - 57,993,332 cycles # 2.658 GHz - 127,990,808 instructions # 2.21 insn per cycle - 
0.022353301 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8910) (512y: 28) (512z: 0) +TOTAL : 0.020450 sec + 59,809,769 cycles # 2.548 GHz + 133,453,298 instructions # 2.23 insn per cycle + 0.024051724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9326) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344103e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363443e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272763e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292384e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292384e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024010 sec -INFO: No Floating Point Exceptions have been reported 
- 50,268,269 cycles # 1.840 GHz - 74,785,740 instructions # 1.49 insn per cycle - 0.027917015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2791) (512y: 30) (512z: 7439) +TOTAL : 0.024694 sec + 50,063,265 cycles # 1.787 GHz + 76,311,691 instructions # 1.52 insn per cycle + 0.028520781 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2302) (512y: 5) (512z: 8341) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 +Avg ME (F77/C++) = 8.1275370319141356E-006 +Relative difference = 3.926667622210349e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 50cf2d796e..b19d440304 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:29 +DATE: 2025-09-24_09:34:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.754018e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.771557e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.249673e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.266284e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.269736e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469482 sec -INFO: No Floating Point Exceptions have been reported - 1,992,256,665 cycles # 2.872 GHz - 2,888,484,617 instructions # 1.45 insn per cycle - 0.750839241 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536874 sec + 2,221,747,197 cycles # 2.822 GHz + 3,163,839,758 instructions # 1.42 insn per cycle + 0.846111338 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.089994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098896e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485052 sec -INFO: No Floating Point Exceptions have been reported - 2,027,704,407 cycles # 2.871 GHz - 3,029,735,278 instructions # 1.49 insn per cycle - 0.765353713 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 104 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 76 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 95 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562526495326E-006 +Relative difference = 3.380331376097252e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.401289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404577e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404577e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.285769e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288921e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157429 sec -INFO: No Floating Point Exceptions have been reported - 471,621,611 cycles # 2.936 GHz - 1,398,387,891 instructions # 2.97 insn per cycle - 0.161191989 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3899) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.162937 sec + 476,178,835 cycles # 2.866 GHz + 1,394,739,218 instructions # 2.93 insn per cycle + 0.166768119 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4201) (avx2: 0) (512y: 0) (512z: 0) 
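The runNcu profiles in these 2025 logs (see the ==PROF== lines above) list three separate kernels (diagram1, diagram2, color_sum_kernel) where the 2024 logs profiled a single monolithic sigmaKin kernel, with visibly lower registers per thread for the per-diagram kernels. A hypothetical sketch of such a split-kernel launch pattern (only the kernel names come from the log; the bodies, signatures and sizes below are placeholders, not the plugin's actual code):

#include <cstdio>
#include <cuda_runtime.h>
// Each diagram accumulates its (placeholder) amplitude into a per-event buffer;
// a final kernel performs the color sum on the accumulated amplitudes.
__global__ void diagram1( double* amp, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) amp[i] += 1.0; // stand-in for the diagram-1 amplitude
}
__global__ void diagram2( double* amp, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) amp[i] += 2.0; // stand-in for the diagram-2 amplitude
}
__global__ void color_sum_kernel( const double* amp, double* me2, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) me2[i] = amp[i] * amp[i]; // stand-in for the color-matrix contraction
}
int main()
{
  const int n = 256;
  double *d_amp = nullptr, *d_me2 = nullptr;
  cudaMalloc( (void**)&d_amp, n * sizeof( double ) );
  cudaMalloc( (void**)&d_me2, n * sizeof( double ) );
  cudaMemset( d_amp, 0, n * sizeof( double ) );
  diagram1<<<1, n>>>( d_amp, n );                // one small kernel per diagram...
  diagram2<<<1, n>>>( d_amp, n );
  color_sum_kernel<<<1, n>>>( d_amp, d_me2, n ); // ...then one color-sum kernel
  cudaDeviceSynchronize();
  double me2_0 = 0;
  cudaMemcpy( &me2_0, d_me2, sizeof( double ), cudaMemcpyDeviceToHost );
  std::printf( "me2[0] = %f\n", me2_0 ); // 9.0 with the placeholder amplitudes
  cudaFree( d_amp );
  cudaFree( d_me2 );
  return 0;
}

Under Nsight Compute, each of these three launches would show up as a separately profiled kernel, as in the ==PROF== lines above.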
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.729709e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.743939e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.743939e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.970683e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.990939e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.990939e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.080446 sec -INFO: No Floating Point Exceptions have been reported - 237,178,815 cycles # 2.833 GHz - 688,220,781 instructions # 2.90 insn per cycle - 0.084309693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.090415 sec + 254,903,248 cycles # 2.722 GHz + 712,053,573 instructions # 2.79 insn per cycle + 0.094247271 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12853) (avx2: 0) (512y: 0) (512z: 0) 
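The "MEK (channelid array)" DEBUG lines above are consistent with the 512 test events being distributed in blocks of 32 over the first 16 of the 72 available channels. One simple assignment that reproduces those counts (illustrative; the test harness may fill the channelid array differently):

#include <cstdio>
#include <map>
int main()
{
  const int nevents = 512, eventsPerChannel = 32;
  std::map<int, int> counts; // channelid -> number of events
  for( int i = 0; i < nevents; i++ )
    counts[i / eventsPerChannel + 1]++; // blocks of 32 events on channels 1..16
  for( const auto& kv : counts ) std::printf( "%d : %d, ", kv.first, kv.second );
  std::printf( "\n" ); // prints "1 : 32, 2 : 32, ..., 16 : 32," as in the log
  return 0;
}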
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415451e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321517e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326552e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039507 sec -INFO: No Floating Point Exceptions have been reported - 114,068,471 cycles # 2.665 GHz - 253,096,543 instructions # 2.22 insn per cycle - 0.043335126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8363) (512y: 0) (512z: 0) +TOTAL : 0.042210 sec + 118,596,276 cycles # 2.612 GHz + 265,047,520 instructions # 2.23 insn per cycle + 0.046012448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9126) (512y: 0) (512z: 0) 
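Each "Internal loops fptype_sv" line above states how many events one SIMD instruction processes in that build: VECTOR[4] with 'avx2' means 4 doubles packed into a 256-bit register, VECTOR[8] with '512z' means 8 doubles in a 512-bit register, and so on. A minimal sketch of such a packed type using GCC/Clang vector extensions (illustrative; the plugin's real fptype_sv machinery is more elaborate):

#include <cstdio>
// 4 doubles = 32 bytes = 256 bits, matching VECTOR[4] ('avx2');
// vector_size(64) would give the VECTOR[8] ('512z') layout instead.
typedef double fptype_v __attribute__( ( vector_size( 32 ) ) );
int main()
{
  fptype_v p = { 1.0, 2.0, 3.0, 4.0 }; // one momentum component for 4 events
  fptype_v w = { 0.5, 0.5, 0.5, 0.5 }; // one weight for 4 events
  fptype_v prod = p * w;               // element-wise: 4 events per multiply
  for( int i = 0; i < 4; i++ ) std::printf( "event %d: %f\n", i, prod[i] );
  return 0;
}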
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680681e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.688641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.688641e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.512441e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.518991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.518991e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033493 sec -INFO: No Floating Point Exceptions have been reported - 101,334,967 cycles # 2.753 GHz - 233,610,113 instructions # 2.31 insn per cycle - 0.037380618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7501) (512y: 146) (512z: 0) +TOTAL : 0.037109 sec + 107,186,687 cycles # 2.649 GHz + 250,306,884 instructions # 2.34 insn per cycle + 0.040964805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8963) (512y: 45) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.194656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199944e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199944e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.160470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165386e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.165386e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046435 sec -INFO: No Floating Point Exceptions have been reported - 91,210,419 cycles # 1.827 GHz - 133,172,431 instructions # 1.46 insn per cycle - 0.050429905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 122) (512z: 6354) +TOTAL : 0.047844 sec + 90,399,024 cycles # 1.768 GHz + 138,717,973 instructions # 1.53 insn per cycle + 0.051641008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1788) (512y: 61) (512z: 8314) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index e1fc789bed..70e3ca8e2d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:41 +DATE: 2025-09-24_09:34:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.793622e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.811451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.814397e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.236382e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.252209e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.255241e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.470592 sec -INFO: No Floating Point Exceptions have been reported - 1,997,502,547 cycles # 2.880 GHz - 2,923,476,215 instructions # 1.46 insn per cycle - 0.750818094 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.535997 sec + 2,217,885,936 cycles # 2.824 GHz + 3,191,898,077 instructions # 1.44 insn per cycle + 0.843021508 seconds time elapsed ......................................................................... 
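The new HASBLAS=hasBlas, CUDACPP_RUNTIME_BLASCOLORSUM= and CUDACPP_RUNTIME_CUBLASTF32TENSOR= lines in these 2025 log headers indicate that the color sum can now be steered to a (cu)BLAS implementation at runtime. The color sum itself is the quadratic form ME^2 = jamp^H C jamp over color flows, which maps naturally onto GEMV/GEMM calls; a toy scalar version of that contraction (plain loops standing in for the BLAS calls, with made-up sizes and values):

#include <complex>
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2; // toy size: the real process has many more color flows
  const std::vector<std::complex<double>> jamp = { { 1.0, 0.5 }, { -0.3, 0.2 } };
  const double colorMatrix[2][2] = { { 3.0, 1.0 }, { 1.0, 3.0 } }; // toy C_ij
  double me2 = 0.0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += std::real( std::conj( jamp[i] ) * colorMatrix[i][j] * jamp[j] );
  std::printf( "ME^2 = %f\n", me2 ); // jamp^H * C * jamp
  return 0;
}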
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.165646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173712e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.491474 sec -INFO: No Floating Point Exceptions have been reported - 2,044,918,526 cycles # 2.859 GHz - 3,006,189,896 instructions # 1.47 insn per cycle - 0.774360899 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 108 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 76 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 95 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562526495326E-006 +Relative difference = 3.380331376097252e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.402707e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.406541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.406541e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.263072e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266213e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156609 sec -INFO: No Floating Point Exceptions have been reported - 468,766,259 cycles # 2.933 GHz - 1,393,706,102 instructions # 2.97 insn per cycle - 0.160398151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3800) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.163259 sec + 474,783,021 cycles # 2.852 GHz + 1,387,157,133 instructions # 2.92 insn per cycle + 0.166971512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4321) (avx2: 0) (512y: 0) (512z: 0) 
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562948736117E-006
 Relative difference = 3.32837900190667e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.728046e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.740604e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.740604e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.056539e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.068459e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.068459e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.079699 sec
-INFO: No Floating Point Exceptions have been reported
- 235,148,851 cycles # 2.837 GHz
- 684,201,633 instructions # 2.91 insn per cycle
- 0.083458032 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9368) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.088414 sec
+ 250,802,322 cycles # 2.738 GHz
+ 707,025,863 instructions # 2.82 insn per cycle
+ 0.092101977 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12829) (avx2: 0) (512y: 0) (512z: 0)
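
In the perf-style counter lines above, "insn per cycle" is simply the instruction count divided by the cycle count: for the new sse4 run, 707,025,863 / 250,802,322 is about 2.82, as printed. The GHz annotation divides cycles by the time the counter was actually running (typically the task clock), which is slightly shorter than the wall-clock "seconds time elapsed", so it cannot be reproduced exactly from these two numbers alone. A quick check of the ratio:

# Quick arithmetic check of the counter lines in the sse4 run above.
cycles = 250_802_322
instructions = 707_025_863
print(f"{instructions / cycles:.2f} insn per cycle")  # prints 2.82
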
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563175290919E-006
 Relative difference = 3.3005037703909805e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.447554e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.453499e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.453499e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.342626e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.347819e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.347819e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.037781 sec
-INFO: No Floating Point Exceptions have been reported
- 111,660,471 cycles # 2.716 GHz
- 248,651,696 instructions # 2.23 insn per cycle
- 0.041691428 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8316) (512y: 0) (512z: 0)
+TOTAL : 0.040651 sec
+ 115,737,876 cycles # 2.641 GHz
+ 260,102,580 instructions # 2.25 insn per cycle
+ 0.044409608 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9072) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.634149e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.641617e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641617e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.458481e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.464627e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.464627e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.033571 sec
-INFO: No Floating Point Exceptions have been reported
- 99,219,938 cycles # 2.697 GHz
- 229,292,514 instructions # 2.31 insn per cycle
- 0.037291206 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7452) (512y: 146) (512z: 0)
+TOTAL : 0.037592 sec
+ 106,299,204 cycles # 2.609 GHz
+ 245,391,200 instructions # 2.31 insn per cycle
+ 0.041387780 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8907) (512y: 45) (512z: 0)
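
The "=Symbols in CPPProcess_cpp.o=" lines count SIMD instructions of each flavour that the compiler emitted into the object file; they show at a glance, for instance, that the 512y build above is still dominated by 256-bit AVX2 code (8907 symbols) with only a handful of 512y instructions (45). A rough stand-alone approximation of such a census, driving objdump from Python (the register-based patterns are simplified placeholders, not the exact filters used by the tput scripts, so the counts will not match the log exactly):

# Rough SIMD census of an object file via objdump; the patterns are
# simplified placeholders and will not reproduce the log's exact counts.
import subprocess

def simd_census(objfile: str) -> dict:
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    lines = asm.splitlines()
    return {
        "xmm (sse-like)": sum("%xmm" in l for l in lines),
        "ymm (avx2/512y-like)": sum("%ymm" in l for l in lines),
        "zmm (512z-like)": sum("%zmm" in l for l in lines),
    }

print(simd_census("build.512y_m_inl0_hrd1/CPPProcess_cpp.o"))
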
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.191988e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.196872e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.196872e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.149106e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.154011e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.154011e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.045809 sec
-INFO: No Floating Point Exceptions have been reported
- 88,834,257 cycles # 1.806 GHz
- 128,615,199 instructions # 1.45 insn per cycle
- 0.049747357 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2035) (512y: 122) (512z: 6355)
+TOTAL : 0.047414 sec
+ 88,430,072 cycles # 1.747 GHz
+ 134,004,604 instructions # 1.52 insn per cycle
+ 0.051267395 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1732) (512y: 61) (512z: 8314)
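
Taken together, the five smeft_gg_tttt C++ runs above give the expected SIMD scaling on this Xeon Silver 4216: relative to the scalar 'none' build, the new MatrixElems throughputs are roughly 1.9x (sse4), 4.1x (avx2), 4.5x (512y) and 3.5x (512z); the 512z build falls back behind 512y, consistent with the lower ~1.75 GHz clock reported for its run. A small sketch tabulating the ratios from the figures quoted above:

# Throughput ratios vs the scalar build, from the new ("+")
# EvtsPerSec[MatrixElems] values in the five runs above.
throughputs = {
    "none": 3.266213e+03,
    "sse4": 6.068459e+03,
    "avx2": 1.347819e+04,
    "512y": 1.464627e+04,
    "512z": 1.154011e+04,
}
base = throughputs["none"]
for backend, eps in throughputs.items():
    print(f"{backend:5s} {eps:12.6e} events/s  x{eps / base:.2f}")
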
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
index 107a77153b..12326e5c9f 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:58:55
+DATE: 2025-09-24_09:31:41
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.910300e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.325267e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.783205e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.500841e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.477366e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559159e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.523085 sec
-INFO: No Floating Point Exceptions have been reported
- 2,188,593,202 cycles # 2.883 GHz
- 3,112,954,096 instructions # 1.42 insn per cycle
- 0.817031478 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 130
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.549848 sec
+ 2,308,392,260 cycles # 2.828 GHz
+ 3,309,894,720 instructions # 1.43 insn per cycle
+ 0.873545190 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956172964262
-Relative difference = 2.590743366698123e-07
+Avg ME (F77/GPU) = 0.14771956172964260
+Relative difference = 2.5907433685770594e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.066686e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.035589e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.035589e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.726949e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.681489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.681489e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.278019 sec
-INFO: No Floating Point Exceptions have been reported
- 3,764,987,469 cycles # 2.931 GHz
- 9,752,169,319 instructions # 2.59 insn per cycle
- 1.285199771 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.470755 sec
+ 4,243,036,983 cycles # 2.875 GHz
+ 9,962,547,463 instructions # 2.35 insn per cycle
+ 1.476417627 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.478889e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.890818e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.890818e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.324411e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.656950e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.656950e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.830635 sec
-INFO: No Floating Point Exceptions have been reported
- 2,356,582,684 cycles # 2.814 GHz
- 5,959,230,788 instructions # 2.53 insn per cycle
- 0.838030934 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1369) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.905852 sec
+ 2,485,591,801 cycles # 2.730 GHz
+ 6,187,854,977 instructions # 2.49 insn per cycle
+ 0.911320733 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1631) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.229956e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.271002e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.271002e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.055426e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.930975e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.930975e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.594206 sec
-INFO: No Floating Point Exceptions have been reported
- 1,695,017,656 cycles # 2.820 GHz
- 3,345,002,918 instructions # 1.97 insn per cycle
- 0.601755215 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1499) (512y: 0) (512z: 0)
+TOTAL : 0.624480 sec
+ 1,727,598,030 cycles # 2.746 GHz
+ 3,276,460,156 instructions # 1.90 insn per cycle
+ 0.629818884 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1543) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.272289e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.349942e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.349942e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.134099e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.082651e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.082651e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.586413 sec
-INFO: No Floating Point Exceptions have been reported
- 1,670,913,790 cycles # 2.815 GHz
- 3,318,759,581 instructions # 1.99 insn per cycle
- 0.594196558 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1375) (512y: 96) (512z: 0)
+TOTAL : 0.605914 sec
+ 1,675,699,986 cycles # 2.743 GHz
+ 3,304,943,330 instructions # 1.97 insn per cycle
+ 0.611525442 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1527) (512y: 17) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.146635e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068698e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068698e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.946458e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.706761e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.706761e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.618076 sec
-INFO: No Floating Point Exceptions have been reported
- 1,426,424,228 cycles # 2.279 GHz
- 2,470,718,173 instructions # 1.73 insn per cycle
- 0.626622796 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 580) (512y: 60) (512z: 1021)
+TOTAL : 0.653065 sec
+ 1,417,818,298 cycles # 2.156 GHz
+ 2,345,380,094 instructions # 1.65 insn per cycle
+ 0.658218910 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 642) (512y: 25) (512z: 1138)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
index 00276091a3..a869e443d5 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:08
+DATE: 2025-09-24_09:32:00
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.969963e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.427733e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.936447e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.501741e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.485765e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.567818e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.519852 sec
-INFO: No Floating Point Exceptions have been reported
- 2,172,307,830 cycles # 2.872 GHz
- 3,081,950,905 instructions # 1.42 insn per cycle
- 0.813507263 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.551341 sec
+ 2,310,675,807 cycles # 2.827 GHz
+ 3,319,532,296 instructions # 1.44 insn per cycle
+ 0.875186841 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956172964262
-Relative difference = 2.590743366698123e-07
+Avg ME (F77/GPU) = 0.14771956172964254
+Relative difference = 2.5907433723349327e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.156288e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.045734e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.045734e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.779674e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.732552e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.732552e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.265578 sec
-INFO: No Floating Point Exceptions have been reported
- 3,747,828,201 cycles # 2.946 GHz
- 9,632,221,913 instructions # 2.57 insn per cycle
- 1.272810702 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.460825 sec
+ 4,199,668,829 cycles # 2.865 GHz
+ 9,806,765,322 instructions # 2.34 insn per cycle
+ 1.466388181 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.494739e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.931280e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.931280e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412344e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.799560e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.799560e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.827695 sec
-INFO: No Floating Point Exceptions have been reported
- 2,378,817,913 cycles # 2.850 GHz
- 5,912,991,474 instructions # 2.49 insn per cycle
- 0.835517705 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1340) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.855928 sec
+ 2,469,646,946 cycles # 2.870 GHz
+ 6,091,622,150 instructions # 2.47 insn per cycle
+ 0.861313869 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1603) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.079942e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.957305e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.957305e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.050597e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.923483e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.923483e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.628333 sec
-INFO: No Floating Point Exceptions have been reported
- 1,788,933,654 cycles # 2.817 GHz
- 3,328,376,953 instructions # 1.86 insn per cycle
- 0.635862534 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1436) (512y: 0) (512z: 0)
+TOTAL : 0.624282 sec
+ 1,719,726,497 cycles # 2.734 GHz
+ 3,226,039,810 instructions # 1.88 insn per cycle
+ 0.629686248 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1460) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.320640e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.437091e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.437091e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.198399e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.213116e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.213116e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.574324 sec
-INFO: No Floating Point Exceptions have been reported
- 1,653,934,067 cycles # 2.845 GHz
- 3,291,054,827 instructions # 1.99 insn per cycle
- 0.581926884 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1328) (512y: 96) (512z: 0)
+TOTAL : 0.590149 sec
+ 1,638,224,463 cycles # 2.753 GHz
+ 3,258,304,993 instructions # 1.99 insn per cycle
+ 0.595665429 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 17) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ]
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.152026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087565e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087565e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.611501 sec -INFO: No Floating Point Exceptions have been reported - 1,420,414,146 cycles # 2.296 GHz - 2,439,626,449 instructions # 1.72 insn per cycle - 0.619276325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 547) (512y: 60) (512z: 1007) +TOTAL : 0.653709 sec + 1,422,880,012 cycles # 2.163 GHz + 2,319,628,542 instructions # 1.63 insn per cycle + 0.659181710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 585) (512y: 25) (512z: 1114) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
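The "OK (relative difference <= 5E-3)" verdicts above come from comparing the Fortran (F77) and C++/CUDA average matrix elements. Below is a minimal C++ sketch of that kind of tolerance check, using the Avg ME values from the build.512z_d_inl0_hrd1 block above; this is an illustration of the comparison being reported, not the actual cmpExe/tput script implementation.

// Sketch of the F77-vs-C++ average-ME cross-check reported in these logs.
#include <cmath>
#include <cstdio>

int main()
{
  // Example values taken from the log above (build.512z_d_inl0_hrd1)
  const double avgMeCpp = 1.477196e-01;        // Avg ME (C++/C++)
  const double avgMeF77 = 0.14771956172964268; // Avg ME (F77/C++)
  const double relDiff = std::fabs( avgMeF77 / avgMeCpp - 1. );
  const double tolerance = 5e-3;
  std::printf( "Relative difference = %.15e\n", relDiff );
  std::puts( relDiff <= tolerance ? "OK (relative difference <= 5E-3)"
                                  : "ERROR (relative difference > 5E-3)" );
  return relDiff <= tolerance ? 0 : 1;
}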
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index bd2093b69b..8955a0ec97 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:44 +DATE: 2025-09-24_09:32:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032605e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087100e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501992e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.658644e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.361450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.502552e+07 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.487961 sec -INFO: No Floating Point Exceptions have been reported - 2,048,884,733 cycles # 2.866 GHz - 2,915,076,407 instructions # 1.42 insn per cycle - 0.773529382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 97 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500309 sec + 2,141,761,893 cycles # 2.827 GHz + 3,051,585,121 instructions # 1.42 insn per cycle + 0.815376304 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 32 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
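The EvtsPerSec figures in these logs are event counts divided by the elapsed time of a timed section; the labels (23), (3) and (3a) correspond to different internal timers of check_cuda.exe/check_cpp.exe whose details are not shown here, which is why they differ from the TOTAL wall time. (Note also that the 2025 ncu profile above shows the former single sigmaKin kernel replaced by separate diagram kernels plus a color_sum_kernel.) A minimal sketch of the throughput arithmetic, assuming a hypothetical computeMEs() stand-in for the real workload:

// Sketch only: time a section and divide events processed by elapsed seconds.
#include <chrono>
#include <cstdio>

static void computeMEs( long nevt ) // hypothetical stand-in for the ME section
{
  volatile double dummy = 0;
  for( long i = 0; i < nevt; ++i ) dummy = dummy + 1e-7;
}

int main()
{
  const int nblocks = 2048, nthreads = 256, niter = 2; // as in "-p 2048 256 2"
  const long nevt = static_cast<long>( nblocks ) * nthreads * niter; // 1048576 events
  const auto t0 = std::chrono::steady_clock::now();
  computeMEs( nevt );
  const auto t1 = std::chrono::steady_clock::now();
  const double secs = std::chrono::duration<double>( t1 - t0 ).count();
  std::printf( "EvtsPerSec[MatrixElems] = ( %e ) sec^-1\n", nevt / secs );
  return 0;
}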
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956735057756 -Relative difference = 4.559355911674916e-07 +Avg ME (F77/GPU) = 0.14771956775803119 +Relative difference = 4.586938839209484e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.070270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039772e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.039772e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.837047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.845359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.845359e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.246364 sec -INFO: No Floating Point Exceptions have been reported - 3,688,263,957 cycles # 2.948 GHz - 9,604,598,454 instructions # 2.60 insn per cycle - 1.251819600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.429486 sec + 4,115,124,509 cycles # 2.870 GHz + 9,856,209,851 instructions # 2.40 insn per cycle + 1.434721281 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 520) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.214709e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.739938e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.739938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.563106 sec -INFO: No Floating Point Exceptions have been reported - 1,636,975,072 cycles # 2.881 GHz - 3,967,404,939 instructions # 2.42 insn per cycle - 0.568812477 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1579) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.638047 sec + 1,798,711,437 cycles # 2.801 GHz + 4,006,865,720 instructions # 2.23 insn per cycle + 0.643227605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1845) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.994371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.295152e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.295152e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020131e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.370178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.370178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.439594 sec -INFO: No Floating Point Exceptions have been reported - 1,256,321,725 cycles # 2.826 GHz - 2,497,438,777 instructions # 1.99 insn per cycle - 0.445252542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1924) (512y: 0) (512z: 0) +TOTAL : 0.437832 sec + 1,215,377,102 cycles # 2.751 GHz + 2,328,338,302 instructions # 1.92 insn per cycle + 0.443031597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1759) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098864e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.632832e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.632832e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.122881e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.707659e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.707659e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.427898 sec -INFO: No Floating Point Exceptions have been reported - 1,236,536,318 cycles # 2.855 GHz - 2,473,365,360 instructions # 2.00 insn per cycle - 0.433705293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1870) (512y: 1) (512z: 0) +TOTAL : 0.425874 sec + 1,193,251,740 cycles # 2.773 GHz + 2,344,495,568 instructions # 1.96 insn per cycle + 0.430964763 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.931142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994223e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994223e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.050003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.443303e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443303e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.448530 sec -INFO: No Floating Point Exceptions have been reported - 1,079,279,667 cycles # 2.379 GHz - 2,073,684,661 instructions # 1.92 insn per cycle - 0.454351959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1011) (512y: 5) (512z: 1292) +TOTAL : 0.434984 sec + 1,026,532,065 cycles # 2.334 GHz + 1,868,649,986 instructions # 1.82 insn per cycle + 0.440356753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 828) (512y: 5) (512z: 1248) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
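The removed "INFO: The following Floating Point Exceptions will cause SIGFPE program aborts" lines refer to trapping FE_DIVBYZERO, FE_INVALID and FE_OVERFLOW. A minimal glibc-specific sketch of how such traps are typically enabled follows; this illustrates the mechanism only, not the plugin's actual FPE-handling code (which evidently no longer prints these messages in the 2025 logs).

// Sketch: enable SIGFPE traps for the three exception classes named above.
// feenableexcept() is a glibc extension declared in <fenv.h>.
#define _GNU_SOURCE 1
#include <fenv.h>
#include <cstdio>

int main()
{
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
  std::puts( "INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW" );
  // From here on, any division by zero, invalid operation or overflow raises SIGFPE.
  return 0;
}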
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index 2473496911..807ed3ccfb 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
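The "DEBUG: MEK (channelid array)" lines above tally how many of the 512 test events were assigned to each channel id. A minimal sketch of that bookkeeping; the channel assignment below is hypothetical (the real harness produced the { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } split shown in the logs), and only the counting itself is being illustrated.

// Sketch: count events per channel id, as in the MEK debug tallies above.
#include <cstdio>
#include <map>
#include <vector>

int main()
{
  std::vector<int> channelIds( 512 );
  for( size_t ievt = 0; ievt < channelIds.size(); ++ievt )
    channelIds[ievt] = 2 + ievt % 5; // hypothetical assignment to channels 2..6
  std::map<int, int> tally;
  for( int ch : channelIds ) ++tally[ch];
  std::printf( "DEBUG: MEK (channelid array) processed %zu events {", channelIds.size() );
  bool first = true;
  for( const auto& [ch, n] : tally )
  {
    std::printf( "%s%d : %d", first ? " " : ", ", ch, n );
    first = false;
  }
  std::printf( " }\n" );
  return 0;
}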
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:56 +DATE: 2025-09-24_09:33:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032625e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129649e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575777e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684124e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.395383e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.536783e+07 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.481858 sec -INFO: No Floating Point Exceptions have been reported - 2,051,512,664 cycles # 2.885 GHz - 2,948,723,179 instructions # 1.44 insn per cycle - 0.768027645 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 86 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.504984 sec + 2,143,393,368 cycles # 2.808 GHz + 3,032,081,133 instructions # 1.41 insn per cycle + 0.820426832 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 32 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
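The 2025 log headers print HASBLAS=hasBlas together with the empty runtime switches CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR. How the plugin interprets these variables is not visible in the log, so treat the following as an assumption: a minimal sketch of querying such switches from the environment, where an unset or empty variable is taken to mean "off".

// Sketch only: read runtime switches like those echoed in the log header.
#include <cstdio>
#include <cstdlib>

static bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value != nullptr && value[0] != '\0'; // unset or empty means "off" (assumption)
}

int main()
{
  for( const char* name : { "CUDACPP_RUNTIME_BLASCOLORSUM", "CUDACPP_RUNTIME_CUBLASTF32TENSOR" } )
  {
    const char* value = std::getenv( name );
    std::printf( "%s=%s (%s)\n", name, value ? value : "", envFlagIsSet( name ) ? "on" : "off" );
  }
  return 0;
}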
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956525510177 -Relative difference = 4.4175008557828484e-07 +Avg ME (F77/GPU) = 0.14771956496406347 +Relative difference = 4.3977987646867276e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.212337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.901470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.947681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.947681e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.227579 sec -INFO: No Floating Point Exceptions have been reported - 3,620,291,769 cycles # 2.937 GHz - 9,471,544,557 instructions # 2.62 insn per cycle - 1.233302650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 367) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.416669 sec + 4,065,194,569 cycles # 2.861 GHz + 9,713,756,382 instructions # 2.39 insn per cycle + 1.421763510 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 444) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.220343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.788185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.503348e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503348e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.560958 sec -INFO: No Floating Point Exceptions have been reported - 1,637,220,191 cycles # 2.892 GHz - 3,933,324,289 instructions # 2.40 insn per cycle - 0.566799529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1517) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.677027 sec + 1,794,955,620 cycles # 2.640 GHz + 3,955,098,557 instructions # 2.20 insn per cycle + 0.682659938 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1794) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.995950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.983089e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.375375e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.375375e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438140 sec -INFO: No Floating Point Exceptions have been reported - 1,255,613,659 cycles # 2.833 GHz - 2,482,092,959 instructions # 1.98 insn per cycle - 0.443764126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0) +TOTAL : 0.442243 sec + 1,221,294,346 cycles # 2.733 GHz + 2,307,350,404 instructions # 1.89 insn per cycle + 0.447543585 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1664) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955698961392
Relative difference = 2.9116235141448046e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.087645e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.599722e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.599722e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.117002e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.716134e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.716134e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0
-TOTAL : 0.428178 sec
-INFO: No Floating Point Exceptions have been reported
- 1,231,320,501 cycles # 2.843 GHz
- 2,457,271,461 instructions # 2.00 insn per cycle
- 0.433769891 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 1) (512z: 0)
+TOTAL : 0.426003 sec
+ 1,190,116,896 cycles # 2.764 GHz
+ 2,327,165,551 instructions # 1.96 insn per cycle
+ 0.431219526 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1677) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955698961392
Relative difference = 2.9116235141448046e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.945345e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.024652e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.024652e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.092007e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.582843e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.582843e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0
-TOTAL : 0.444653 sec
-INFO: No Floating Point Exceptions have been reported
- 1,073,447,692 cycles # 2.387 GHz
- 2,057,517,401 instructions # 1.92 insn per cycle
- 0.450271011 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 906) (512y: 5) (512z: 1273)
+TOTAL : 0.428488 sec
+ 1,017,961,260 cycles # 2.351 GHz
+ 1,851,270,904 instructions # 1.82 insn per cycle
+ 0.433719442 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 724) (512y: 5) (512z: 1232)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955262403935
Relative difference = 3.207154680524219e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
index 5ae4907c26..58754172fd 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:20
+DATE: 2025-09-24_09:32:19
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.059495e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.307970e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.770458e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.505328e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.491174e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.573966e+07 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.521991 sec
-INFO: No Floating Point Exceptions have been reported
- 2,182,804,723 cycles # 2.882 GHz
- 3,091,712,352 instructions # 1.42 insn per cycle
- 0.814546737 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 130
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.550390 sec
+ 2,310,532,668 cycles # 2.826 GHz
+ 3,294,016,749 instructions # 1.43 insn per cycle
+ 0.874757748 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956187351573
-Relative difference = 2.5810037581511336e-07
+Avg ME (F77/GPU) = 0.14771956611891737
+Relative difference = 2.2936077970637e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.006175e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.025890e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.025890e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.641873e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.554824e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.554824e+05 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.286472 sec
-INFO: No Floating Point Exceptions have been reported
- 3,808,533,169 cycles # 2.945 GHz
- 9,779,238,528 instructions # 2.57 insn per cycle
- 1.294044616 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.485790 sec
+ 4,277,749,857 cycles # 2.870 GHz
+ 9,988,233,894 instructions # 2.33 insn per cycle
+ 1.491506902 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.477969e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.892042e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.892042e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.413178e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.797545e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.797545e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.834785 sec
-INFO: No Floating Point Exceptions have been reported
- 2,360,159,801 cycles # 2.803 GHz
- 5,954,715,990 instructions # 2.52 insn per cycle
- 0.842708021 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1412) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.856045 sec
+ 2,411,968,983 cycles # 2.802 GHz
+ 6,079,988,422 instructions # 2.52 insn per cycle
+ 0.861716549 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1674) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.260391e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.350498e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.350498e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.119632e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.062946e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.062946e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.587860 sec
-INFO: No Floating Point Exceptions have been reported
- 1,670,861,769 cycles # 2.810 GHz
- 3,283,918,691 instructions # 1.97 insn per cycle
- 0.595426943 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1567) (512y: 0) (512z: 0)
+TOTAL : 0.608749 sec
+ 1,686,930,401 cycles # 2.750 GHz
+ 3,226,597,852 instructions # 1.91 insn per cycle
+ 0.614233677 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1612) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.348300e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.498815e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.498815e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.188784e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.193624e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.193624e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.573278 sec
-INFO: No Floating Point Exceptions have been reported
- 1,645,784,221 cycles # 2.835 GHz
- 3,247,832,958 instructions # 1.97 insn per cycle
- 0.581347619 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 101) (512z: 0)
+TOTAL : 0.592651 sec
+ 1,642,010,663 cycles # 2.749 GHz
+ 3,252,569,969 instructions # 1.98 insn per cycle
+ 0.598011332 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1602) (512y: 19) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.143317e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068862e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068862e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.986612e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.779143e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.779143e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.613179 sec
-INFO: No Floating Point Exceptions have been reported
- 1,394,199,360 cycles # 2.248 GHz
- 2,406,597,613 instructions # 1.73 insn per cycle
- 0.620673412 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1063)
+TOTAL : 0.642604 sec
+ 1,399,424,953 cycles # 2.162 GHz
+ 2,321,995,232 instructions # 1.66 insn per cycle
+ 0.648127480 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 787) (512y: 29) (512z: 1168)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
index 3e507cd882..63db15facd 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:32
+DATE: 2025-09-24_09:32:35
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.080757e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.449829e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.987143e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.501613e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.478366e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559919e+07 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.520335 sec
-INFO: No Floating Point Exceptions have been reported
- 2,182,231,478 cycles # 2.885 GHz
- 3,097,447,003 instructions # 1.42 insn per cycle
- 0.813407395 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.551435 sec
+ 2,306,006,273 cycles # 2.825 GHz
+ 3,311,408,318 instructions # 1.44 insn per cycle
+ 0.874133523 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956187351573
-Relative difference = 2.5810037581511336e-07
+Avg ME (F77/GPU) = 0.14771956611891737
+Relative difference = 2.2936077970637e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 8.967180e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.023779e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.023779e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.719475e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.651618e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.651618e+05 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.289771 sec
-INFO: No Floating Point Exceptions have been reported
- 3,794,201,935 cycles # 2.927 GHz
- 9,666,542,351 instructions # 2.55 insn per cycle
- 1.297077628 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.471775 sec
+ 4,234,745,657 cycles # 2.868 GHz
+ 9,831,904,342 instructions # 2.32 insn per cycle
+ 1.477406786 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.583493e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.064503e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.064503e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.374919e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.737278e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.737278e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.784715 sec
-INFO: No Floating Point Exceptions have been reported
- 2,328,374,642 cycles # 2.942 GHz
- 5,878,440,022 instructions # 2.52 insn per cycle
- 0.792155161 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1371) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.876478 sec
+ 2,395,806,213 cycles # 2.719 GHz
+ 5,984,137,010 instructions # 2.50 insn per cycle
+ 0.882136028 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1646) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.254464e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.329047e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.329047e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.126066e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.082964e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.082964e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.590226 sec
-INFO: No Floating Point Exceptions have been reported
- 1,689,754,472 cycles # 2.827 GHz
- 3,255,343,739 instructions # 1.93 insn per cycle
- 0.598325338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1483) (512y: 0) (512z: 0)
+TOTAL : 0.606254 sec
+ 1,679,878,737 cycles # 2.750 GHz
+ 3,176,290,390 instructions # 1.89 insn per cycle
+ 0.611543989 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1529) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.345727e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.502859e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.502859e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.232873e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.291969e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.291969e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.572126 sec
-INFO: No Floating Point Exceptions have been reported
- 1,634,040,486 cycles # 2.820 GHz
- 3,219,951,921 instructions # 1.97 insn per cycle
- 0.580193189 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1382) (512y: 101) (512z: 0)
+TOTAL : 0.582901 sec
+ 1,615,766,448 cycles # 2.750 GHz
+ 3,205,698,167 instructions # 1.98 insn per cycle
+ 0.588288106 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1521) (512y: 19) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.168828e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.118471e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.118471e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.013041e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.827900e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.827900e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.609357 sec
-INFO: No Floating Point Exceptions have been reported
- 1,417,478,840 cycles # 2.299 GHz
- 2,399,490,515 instructions # 1.69 insn per cycle
- 0.617376810 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1056)
+TOTAL : 0.634290 sec
+ 1,400,699,809 cycles # 2.193 GHz
+ 2,295,187,551 instructions # 1.64 insn per cycle
+ 0.639552302 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 730) (512y: 29) (512z: 1144)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
index 607647c622..aeaac499d3 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:33 +DATE: 2025-09-24_09:28:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.270000e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.214418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.893995e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.399782e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051193e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058591e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542175 sec -INFO: No Floating Point Exceptions have been reported - 2,178,993,269 cycles # 2.803 GHz - 3,108,059,533 instructions # 1.43 insn per cycle - 0.838052893 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.626877 sec + 2,749,226,499 cycles # 2.827 GHz + 4,334,745,620 instructions # 1.58 insn per cycle + 1.033290843 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158358666195562 Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.876984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.876984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.751455e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795224e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.836443 sec -INFO: No Floating Point Exceptions have been reported - 17,247,101,824 cycles # 2.952 GHz - 45,921,478,129 instructions # 2.66 insn per cycle - 5.842453521 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.095203 sec + 17,539,316,031 cycles # 2.876 GHz + 47,145,637,455 instructions # 2.69 insn per cycle + 6.100855118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194407 Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.179372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.939396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.076198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.076198e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.409251 sec -INFO: No Floating Point Exceptions have been reported - 10,038,815,546 cycles # 2.940 GHz - 27,809,165,185 instructions # 2.77 insn per cycle - 3.415697404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2537) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.676656 sec + 10,572,965,063 cycles # 2.872 GHz + 29,309,599,190 instructions # 2.77 insn per cycle + 3.682010885 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2912) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.016017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.397611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.397611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.757789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.113659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.113659e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.202025 sec -INFO: No Floating Point Exceptions have been reported - 6,083,216,423 cycles # 2.757 GHz - 12,595,496,799 instructions # 2.07 insn per cycle - 2.208459235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) +TOTAL : 2.312987 sec + 6,232,025,172 cycles # 2.689 GHz + 12,530,852,292 instructions # 2.01 insn per cycle + 2.318453338 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2739) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.491994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.041839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.435080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.435080e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.018419 sec -INFO: No Floating Point Exceptions have been reported - 5,588,215,007 cycles # 2.761 GHz - 12,004,808,489 instructions # 2.15 insn per cycle - 2.024606102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2365) (512y: 144) (512z: 0) +TOTAL : 2.187506 sec + 5,882,467,926 cycles # 2.684 GHz + 12,196,990,510 instructions # 2.07 insn per cycle + 2.192967820 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2591) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.529303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.310937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.475786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.082414 sec -INFO: No Floating Point Exceptions have been reported - 5,763,724,377 cycles # 1.867 GHz - 8,350,228,242 instructions # 1.45 insn per cycle - 3.088879573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1468) (512y: 122) (512z: 1806) +TOTAL : 3.275794 sec + 5,837,040,134 cycles # 1.780 GHz + 7,907,522,148 instructions # 1.35 insn per cycle + 3.281439473 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1907) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 3ed4c3c5ff..5acef4ec65 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:58 +DATE: 2025-09-24_09:29:07 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.306886e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.297289e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.977845e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.365587e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046816e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054123e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530204 sec -INFO: No Floating Point Exceptions have been reported - 2,211,323,980 cycles # 2.884 GHz - 3,201,430,578 instructions # 1.45 insn per cycle - 0.823926524 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624519 sec + 2,779,082,474 cycles # 2.836 GHz + 4,433,471,999 instructions # 1.60 insn per cycle + 1.037558245 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158358666195562 Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.872475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807758e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855108e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.706755 sec -INFO: No Floating Point Exceptions have been reported - 16,797,600,798 cycles # 2.941 GHz - 44,912,592,336 instructions # 2.67 insn per cycle - 5.712473159 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 566) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.908887 sec + 16,978,475,167 cycles # 2.872 GHz + 45,996,489,553 instructions # 2.71 insn per cycle + 5.914594704 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 634) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552215e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.961290e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.101064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.101064e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.215396 sec -INFO: No Floating Point Exceptions have been reported - 9,523,990,060 cycles # 2.957 GHz - 26,686,144,259 instructions # 2.80 insn per cycle - 3.221864250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2326) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.650084 sec + 10,495,614,562 cycles # 2.872 GHz + 29,292,900,438 instructions # 2.79 insn per cycle + 3.655616235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2906) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.628485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.953785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.953785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.761682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118086e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.376456 sec -INFO: No Floating Point Exceptions have been reported - 6,603,885,103 cycles # 2.772 GHz - 14,117,515,687 instructions # 2.14 insn per cycle - 2.382952116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2705) (512y: 0) (512z: 0) +TOTAL : 2.310179 sec + 6,236,808,714 cycles # 2.695 GHz + 12,519,643,661 instructions # 2.01 insn per cycle + 2.315615614 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.799064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.148539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.148539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.065779e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.461598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.461598e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.297050 sec -INFO: No Floating Point Exceptions have been reported - 6,386,723,525 cycles # 2.773 GHz - 13,726,619,432 instructions # 2.15 insn per cycle - 2.304339219 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2356) (512y: 298) (512z: 0) +TOTAL : 2.177250 sec + 5,873,454,407 cycles # 2.692 GHz + 12,187,790,357 instructions # 2.08 insn per cycle + 2.182588943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2573) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.317632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.254444 sec -INFO: No Floating Point Exceptions have been reported - 5,974,020,045 cycles # 1.833 GHz - 10,122,964,274 instructions # 1.69 insn per cycle - 3.261538649 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1318) (512y: 208) (512z: 1986) +TOTAL : 3.267572 sec + 5,832,481,864 cycles # 1.783 GHz + 7,898,618,772 instructions # 1.35 insn per cycle + 3.273005255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 65) (512z: 1903) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 7bd4c9bca6..c3f3d78cd2 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:13 +DATE: 2025-09-24_09:30:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.178914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.740854e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.866078e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915796e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.152410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168379e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.492488 sec -INFO: No Floating Point Exceptions have been reported - 2,067,407,730 cycles # 2.879 GHz - 2,921,575,837 instructions # 1.41 insn per cycle - 0.777094459 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536916 sec + 2,311,303,351 cycles # 2.820 GHz + 3,451,933,380 instructions # 1.49 insn per cycle + 0.876584729 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787176478654 +Relative difference = 1.8710626416180963e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.988210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.988210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.826941e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.876900e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876900e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.510560 sec -INFO: No Floating Point Exceptions have been reported - 16,216,363,781 cycles # 2.940 GHz - 45,321,064,348 instructions # 2.79 insn per cycle - 5.516237540 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.827244 sec + 16,745,287,986 cycles # 2.872 GHz + 46,779,095,008 instructions # 2.79 insn per cycle + 5.832669438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491693437099 +Relative difference = 8.400614836019365e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.554782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.893509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.893509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.099688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.379802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.379802e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.389253 sec -INFO: No Floating Point Exceptions have been reported - 7,056,712,623 cycles # 2.947 GHz - 17,792,064,584 instructions # 2.52 insn per cycle - 2.395009745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3147) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.646629 sec + 7,621,153,372 cycles # 2.875 GHz + 18,459,205,647 instructions # 2.42 insn per cycle + 2.651770005 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3490) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158486870350316 +Relative difference = 1.5525218811688918e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.496890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.496890e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.529214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.733909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.733909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.343765 sec -INFO: No Floating Point Exceptions have been reported - 3,745,450,403 cycles # 2.777 GHz - 8,262,540,860 instructions # 2.21 insn per cycle - 1.349671424 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) +TOTAL : 1.317398 sec + 3,571,029,960 cycles # 2.701 GHz + 7,635,561,961 instructions # 2.14 insn per cycle + 1.322807202 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3265) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.821818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.907154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.022942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022942e+06 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.275053 sec -INFO: No Floating Point Exceptions have been reported - 3,558,622,083 cycles # 2.780 GHz - 7,915,407,710 instructions # 2.22 insn per cycle - 1.280856743 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3214) (512y: 20) (512z: 0) +TOTAL : 1.264760 sec + 3,427,937,601 cycles # 2.701 GHz + 7,479,564,971 instructions # 2.18 insn per cycle + 1.270093065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3174) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.584138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.256759e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.256759e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.772456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.505125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.505125e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.679646 sec -INFO: No Floating Point Exceptions have been reported - 3,255,689,642 cycles # 1.933 GHz - 6,101,216,288 instructions # 1.87 insn per cycle - 1.685383243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2258) (512y: 22) (512z: 2156) +TOTAL : 1.637116 sec + 3,057,489,541 cycles # 1.863 GHz + 5,257,903,588 instructions # 1.72 insn per cycle + 1.642609723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2082) (512y: 5) (512z: 2093) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 +Avg ME (F77/C++) = 2.0158476332435384 +Relative difference = 1.8193656547763924e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index bd2def4f48..04242d0363 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:33 +DATE: 2025-09-24_09:31:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.136229e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747823e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880709e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.924572e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.180163e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.488528 sec -INFO: No Floating Point Exceptions have been reported - 2,057,813,122 cycles # 2.874 GHz - 2,903,563,490 instructions # 1.41 insn per cycle - 0.774040886 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.531938 sec + 2,300,025,009 cycles # 2.829 GHz + 3,436,350,240 instructions # 1.49 insn per cycle + 0.870280136 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787176478654 +Relative difference = 1.8710626416180963e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.970300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.026987e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.026987e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.856598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.908262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.908262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.407589 sec -INFO: No Floating Point Exceptions have been reported - 15,991,185,925 cycles # 2.955 GHz - 44,429,993,623 instructions # 2.78 insn per cycle - 5.412895968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 533) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.734798 sec + 16,473,045,879 cycles # 2.870 GHz + 45,850,587,770 instructions # 2.78 insn per cycle + 5.740170772 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491693437099 +Relative difference = 8.400614836019365e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.328908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.096624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.376244e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.376244e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.053409 sec -INFO: No Floating Point Exceptions have been reported - 6,061,427,520 cycles # 2.945 GHz - 17,076,312,832 instructions # 2.82 insn per cycle - 2.059026016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2862) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.647878 sec + 7,614,623,600 cycles # 2.871 GHz + 18,447,507,149 instructions # 2.42 insn per cycle + 2.653267513 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158486870350316 +Relative difference = 1.5525218811688918e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.019252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.594125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.594125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.522002e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.728728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.728728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.827330 sec -INFO: No Floating Point Exceptions have been reported - 5,036,041,688 cycles # 2.749 GHz - 10,223,391,747 instructions # 2.03 insn per cycle - 1.833165934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3906) (512y: 0) (512z: 0) +TOTAL : 1.317072 sec + 3,572,319,102 cycles # 2.703 GHz + 7,627,371,635 instructions # 2.14 insn per cycle + 1.322524895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.156943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.915944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.023940e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023940e+06 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.789449 sec -INFO: No Floating Point Exceptions have been reported - 4,972,642,094 cycles # 2.772 GHz - 9,995,367,434 instructions # 2.01 insn per cycle - 1.795052964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3805) (512y: 2) (512z: 0) +TOTAL : 1.262360 sec + 3,416,193,686 cycles # 2.696 GHz + 7,470,998,329 instructions # 2.19 insn per cycle + 1.267574585 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3149) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.670992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.000057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.000057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.765679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.496653e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.496653e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.331763 sec -INFO: No Floating Point Exceptions have been reported - 4,369,500,962 cycles # 1.870 GHz - 8,444,287,674 instructions # 1.93 insn per cycle - 2.337616992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2744) (512y: 4) (512z: 2754) +TOTAL : 1.637363 sec + 3,050,184,653 cycles # 1.858 GHz + 5,250,274,198 instructions # 1.72 insn per cycle + 1.642488720 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2054) (512y: 5) (512z: 2092) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 +Avg ME (F77/C++) = 2.0158476332435384 +Relative difference = 1.8193656547763924e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 9029ad668b..7c64f69cf6 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:23 +DATE: 2025-09-24_09:29:40 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.278122e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.299718e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.972605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.407003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051625e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058988e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.535533 sec -INFO: No Floating Point Exceptions have been reported - 2,218,013,615 cycles # 2.871 GHz - 3,167,587,965 instructions # 1.43 insn per cycle - 0.830721869 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.626628 sec + 2,745,317,317 cycles # 2.834 GHz + 4,375,733,633 instructions # 1.59 insn per cycle + 1.030270764 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359252211273 +Relative difference = 3.709571258359381e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.807535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.733243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.910224 sec -INFO: No Floating Point Exceptions have been reported - 17,388,420,068 cycles # 2.940 GHz - 46,077,588,135 instructions # 2.65 insn per cycle - 5.916245730 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.158417 sec + 17,715,939,856 cycles # 2.875 GHz + 47,246,785,246 instructions # 2.67 insn per cycle + 6.164118051 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.226882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387878e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.957728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.098130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.098130e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.360909 sec -INFO: No Floating Point Exceptions have been reported - 9,940,043,952 cycles # 2.953 GHz - 27,598,360,403 instructions # 2.78 insn per cycle - 3.367569953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.654698 sec + 10,452,582,819 cycles # 2.857 GHz + 28,908,318,583 instructions # 2.77 insn per cycle + 3.660369759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.038546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.875920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.250078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.250078e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.194996 sec -INFO: No Floating Point Exceptions have been reported - 6,084,814,623 cycles # 2.765 GHz - 12,511,133,896 instructions # 2.06 insn per cycle - 2.201688699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2776) (512y: 0) (512z: 0) +TOTAL : 2.259915 sec + 6,093,808,686 cycles # 2.691 GHz + 12,337,204,259 instructions # 2.02 insn per cycle + 2.265459762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.589922e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.177033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591064e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 1.988387 sec -INFO: No Floating Point Exceptions have been reported - 5,540,380,764 cycles # 2.778 GHz - 11,938,541,192 instructions # 2.15 insn per cycle - 1.995322896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 146) (512z: 0) +TOTAL : 2.133158 sec + 5,748,650,852 cycles # 2.689 GHz + 11,995,867,966 instructions # 2.09 insn per cycle + 2.138612056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2715) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.615006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.338127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505246e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.015683 sec -INFO: No Floating Point Exceptions have been reported - 5,630,115,254 cycles # 1.863 GHz - 8,130,918,173 instructions # 1.44 insn per cycle - 3.022730001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1865) +TOTAL : 3.249823 sec + 5,782,616,223 cycles # 1.777 GHz + 7,825,984,086 instructions # 1.35 insn per cycle + 3.255283004 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1937) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 44aa1a6a94..e76b553aa7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:48 +DATE: 2025-09-24_09:30:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.308177e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.314026e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.965515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.373817e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047737e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055011e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530653 sec -INFO: No Floating Point Exceptions have been reported - 2,220,013,015 cycles # 2.891 GHz - 3,185,773,009 instructions # 1.44 insn per cycle - 0.824701846 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624694 sec + 2,775,081,370 cycles # 2.833 GHz + 4,447,666,860 instructions # 1.60 insn per cycle + 1.037097767 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359252211273 +Relative difference = 3.709571258359381e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.857128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.787280e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833233e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.753526 sec -INFO: No Floating Point Exceptions have been reported - 16,958,834,547 cycles # 2.945 GHz - 45,095,701,979 instructions # 2.66 insn per cycle - 5.759360611 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.974558 sec + 17,164,367,416 cycles # 2.871 GHz + 46,097,766,780 instructions # 2.69 insn per cycle + 5.980350288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 634) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
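The cmpExe steps above accept a sample when the relative difference between the C++/GPU and F77/GPU average MEs stays below 5E-3. The formula itself is not part of this patch; a plausible bash/awk reconstruction, consistent with the numbers printed above (2.015836e+00 vs 2.0158359252211273 giving 3.709571e-08), is:

  # Hypothetical re-derivation of the printed relative difference:
  # the actual cmpExe arithmetic is not shown in this patch.
  me1=2.015836e+00         # Avg ME (C++/GPU)
  me2=2.0158359252211273   # Avg ME (F77/GPU)
  awk -v me1=$me1 -v me2=$me2 'BEGIN {
    d = me1 - me2; if ( d < 0 ) d = -d;   # absolute difference
    print d / ( 0.5 * ( me1 + me2 ) )     # ~3.709571e-08
  }'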
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.365466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.980851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.122264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.232551 sec -INFO: No Floating Point Exceptions have been reported - 9,533,065,833 cycles # 2.943 GHz - 26,273,852,197 instructions # 2.76 insn per cycle - 3.239846074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.626612 sec + 10,414,531,441 cycles # 2.869 GHz + 28,891,877,424 instructions # 2.77 insn per cycle + 3.632098254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.821697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.821697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.880045e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.253649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.253649e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.435584 sec -INFO: No Floating Point Exceptions have been reported - 6,758,526,375 cycles # 2.768 GHz - 14,047,168,742 instructions # 2.08 insn per cycle - 2.442338814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 0) (512z: 0) +TOTAL : 2.257801 sec + 6,087,823,639 cycles # 2.691 GHz + 12,326,159,258 instructions # 2.02 insn per cycle + 2.263202472 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2865) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.791737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.138604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.138604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.183890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.604893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.604893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.301242 sec -INFO: No Floating Point Exceptions have been reported - 6,403,253,635 cycles # 2.776 GHz - 13,529,712,107 instructions # 2.11 insn per cycle - 2.307614270 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 302) (512z: 0) +TOTAL : 2.129962 sec + 5,750,241,658 cycles # 2.694 GHz + 11,985,569,411 instructions # 2.08 insn per cycle + 2.135441578 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2697) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.627313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.823087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.334001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500633e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.002431 sec -INFO: No Floating Point Exceptions have been reported - 5,614,669,392 cycles # 1.866 GHz - 9,218,497,811 instructions # 1.64 insn per cycle - 3.009264991 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2059) +TOTAL : 3.252743 sec + 5,780,173,629 cycles # 1.775 GHz + 7,818,044,121 instructions # 1.35 insn per cycle + 3.258145265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 69) (512z: 1933) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
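Each '=Symbols in CPPProcess_cpp.o=' line above tallies SIMD instruction flavours in the compiled object; the counting code lives elsewhere in throughputX.sh and is untouched by this patch. A hypothetical sketch of such a tally (the instruction patterns here are illustrative assumptions, not the script's actual lists):

  # Hypothetical sketch: disassemble the object and count representative
  # instructions per SIMD family (patterns are assumed, not the real lists).
  obj=CPPProcess_cpp.o
  sse4=$(objdump -d $obj | grep -c 'pmulld\|blendv')
  avx2=$(objdump -d $obj | grep -c 'vfmadd\|vperm')
  echo "=Symbols in $obj= (~sse4: $sse4) (avx2: $avx2)"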
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index 088371cb95..c4180b6725 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) bckend=$(basename $(cd $scrdir; cd ..; pwd)) # cudacpp or alpaka @@ -10,7 +10,7 @@ cd $scrdir function usage() { - echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-makeonly] [-makeclean] [-makej] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-makeonly] [-makeclean] [-makej] [-scaling] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -33,8 +33,10 @@ helinls="0" hrdcods="0" rndgen= rmbsmp= +blas="" # build with blas but disable it at runtime steps="make test" makej= +scaling= ###nofpe= dlp= dlpset=0 @@ -117,6 +119,12 @@ for arg in $*; do rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then rmbsmp=$arg + elif [ "$arg" == "-noBlas" ]; then # build with blas but disable it at runtime + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg + elif [ "$arg" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg elif [ "$arg" == "-makeonly" ]; then if [ "${steps}" == "make test" ]; then steps="make" @@ -131,6 +139,8 @@ for arg in $*; do fi elif [ "$arg" == "-makej" ]; then makej=-makej + elif [ "$arg" == "-scaling" ]; then + scaling=$arg ###elif [ "$arg" == "-nofpe" ]; then ### nofpe=-nofpe else @@ -175,6 +185,8 @@ for step in $steps; do args="${args} ${alpaka}" # optionally disable alpaka tests args="${args} ${rndgen}" # optionally use common random numbers or curand on host args="${args} ${rmbsmp}" # optionally use rambo or bridge on host + args="${args} ${scaling}" # optionally run scaling tests + args="${args} ${blas}" # optionally build with no blas or instead enable it at runtime ###args="${args} ${nofpe}" # optionally disable FPEs args="${args} ${bldall}" # avx, fptype, helinl and hrdcod are now supported for all processes if [ "${step}" == "makeclean" ]; then @@ -191,6 +203,8 @@ for step in $steps; do logfile=logs_${proc#-}_${sufflog}/log_${proc#-}_${sufflog}_${fptype}_inl${helinl}_hrd${hrdcod}.txt if [ "${rndgen}" != "" ]; then logfile=${logfile%.txt}_${rndgen#-}.txt; fi if [ "${rmbsmp}" != "" ]; then logfile=${logfile%.txt}_${rmbsmp#-}.txt; fi + if [ "${blas}" != "" ]; then logfile=${logfile%.txt}_${blas#-}.txt; fi + if [ "${scaling}" != "" ]; then logfile=${logfile%.txt}.scaling; fi printf "\n%80s\n" |tr " " "*" printf "*** ./throughputX.sh $args | tee $logfile" printf "\n%80s\n" |tr " " "*" diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 68df662f58..78316f963b 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set +x # not verbose set -e # fail on error @@ -19,7 +19,7 @@ export MG5AMC_CHANNELID_DEBUG=1 function usage() { - echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-v] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-scaling] [-v] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -49,7 +49,9 @@ fptypes="m" # new default #995 (was "d") helinls="0" hrdcods="0" rndgen="" -rmbsam="" +rmbsmp="" + +blas="" # build with blas but disable it at runtime maketype= makej= @@ -59,6 +61,7 @@ div=0 req=0 detailed=0 gtest= +scaling=0 ###nofpe=0 verbose=0 @@ -211,6 +214,14 @@ while [ "$1" != "" ]; do elif [ "$1" == "-bridge" ]; then rmbsmp=" -${1}" shift + elif [ "$1" == "-noBlas" ]; then # build without blas + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift + elif [ "$1" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift elif [ "$1" == "-makeonly" ] || [ "$1" == "-makeclean" ] || [ "$1" == "-makecleanonly" ] || [ "$1" == "-dryrun" ]; then if [ "${maketype}" != "" ] && [ "${maketype}" != "$1" ]; then echo "ERROR! Options -makeonly, -makeclean, -makecleanonly and -dryrun are incompatible"; usage @@ -245,6 +256,9 @@ while [ "$1" != "" ]; do fi gtest=0 shift + elif [ "$1" == "-scaling" ]; then + scaling=1 + shift ###elif [ "$1" == "-nofpe" ]; then ### nofpe=1 ### shift @@ -371,6 +385,9 @@ function showdir() echo $dir } +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + ###echo -e "\n********************************************************************************\n" printf "\n" @@ -434,6 +451,13 @@ done # PART 2 - build the executables which should be run ########################################################################## +if [ "${blas}" == "-noBlas" ]; then + export HASBLAS=hasNoBlas +else + export HASBLAS=hasBlas +fi +echo HASBLAS=${HASBLAS} + unset GTEST_ROOT unset LOCALGTEST @@ -497,6 +521,18 @@ if [ "${maketype}" != "-dryrun" ]; then printf "DATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" fi +echo HASBLAS=${HASBLAS} + +if [ "${blas}" == "-blasOn" ]; then + export CUDACPP_RUNTIME_BLASCOLORSUM=1 +else + unset CUDACPP_RUNTIME_BLASCOLORSUM +fi +echo CUDACPP_RUNTIME_BLASCOLORSUM=${CUDACPP_RUNTIME_BLASCOLORSUM} + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +echo CUDACPP_RUNTIME_CUBLASTF32TENSOR=${CUDACPP_RUNTIME_CUBLASTF32TENSOR} + function runExe() { exe1=$1 args="$2" @@ -507,6 +543,7 @@ function runExe() { # Optionally add other patterns here for some specific configurations (e.g. clang) if [ "${exe1%%/check_cuda*}" != "${exe1}" ] || [ "${exe1%%/check_hip*}" != "${exe1}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi pattern="${pattern}|Workflow" + ###pattern="${pattern}|BLASCOLORSUM" ###pattern="${pattern}|CUCOMPLEX" ###pattern="${pattern}|COMMON RANDOM|CURAND HOST \(CUDA" pattern="${pattern}|ERROR" @@ -523,7 +560,7 @@ function runExe() { if [ "${detailed}" == "1" ]; then pattern="${pattern}|#"; fi if [ "${verbose}" == "1" ]; then set -x; fi ###perf stat -d $exe1 $args 2>&1 | grep -v "Performance counter stats" - perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" + perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' set +x else # -- Older version using time @@ -539,6 +576,7 @@ function runTest() { echo "runTest $exe1" if [ "${maketype}" == "-dryrun" ]; then return; fi pattern="PASS|FAIL" + ###pattern="${pattern}|BLASCOLORSUM" pattern="${pattern}|ERROR" pattern="${pattern}|WARNING" pattern="${pattern}|Floating Point Exception" @@ -563,10 +601,12 @@ function cmpExe() { echo "ERROR! C++ calculation (C++${tag} failed"; exit 1 # expose FPE crash #1003 on HIP fi me1=$(cat ${tmp1} | grep MeanMatrix | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM if ! ${exef} ${argsf} 2>${tmp2} >${tmp1}; then echo "ERROR! 
Fortran calculation (F77${tag} failed"; exit 1 fi me2=$(cat ${tmp1} | grep Average | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM echo -e "Avg ME (C++${tag} = ${me1}\nAvg ME (F77${tag} = ${me2}" if [ "${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77${tag} returned NaN"; exit 1 @@ -588,16 +628,24 @@ function runNcu() { args="$2" args="$args$rndgen$rmbsmp" echo "runNcu $exe1 $args" - if [ "${verbose}" == "1" ]; then set -x; fi - #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' - set +e # do not fail on error - out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args) - echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes - set -e # fail on error (after ncu and after egrep!) - out=$(echo "${out}" | egrep '(sigmaKin|registers| sm)' | tr "\n" " ") # NB must escape $out in between quotes - echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' - echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' - set +x + ###echoblas=1 + ###kernels="calculate_jamps color_sum_kernel" # before kernel splitting + kernels="diagram1 diagram2 color_sum_kernel" # with kernel splitting + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels kernel"; fi # heavy to profile... + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels regex:gemm"; fi # output to be improved... + for kernel in $kernels; do + if [ "${verbose}" == "1" ]; then set -x; fi + #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' + set +e # do not fail on error + out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args) + echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes + ###if [ "${echoblas}" == "1" ]; then echo "$out" | egrep '(BLASCOLORSUM)'; echoblas=0; fi + set -e # fail on error (after ncu and after egrep!) 
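Taken together, the -noBlas/-blasOn hunks above implement a two-level switch: HASBLAS selects at build time whether BLAS support is compiled in, while CUDACPP_RUNTIME_BLASCOLORSUM enables the BLAS color sum at run time. A minimal sketch of that logic, condensed from the hunks rather than copied verbatim from the script:

  # blas is "", "-noBlas" or "-blasOn" after option parsing
  # (the two flags are rejected as incompatible if both are given)
  if [ "${blas}" == "-noBlas" ]; then
    export HASBLAS=hasNoBlas                # build without BLAS
  else
    export HASBLAS=hasBlas                  # build with BLAS (default)
  fi
  if [ "${blas}" == "-blasOn" ]; then
    export CUDACPP_RUNTIME_BLASCOLORSUM=1   # enable BLAS color sum at run time
  else
    unset CUDACPP_RUNTIME_BLASCOLORSUM      # BLAS built in but disabled
  fi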
+ out=$(echo "${out}" | egrep "(${kernel}|registers| sm)" | tr "\n" " ") # NB must escape $out in between quotes + echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' + echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' + set +x + done } # Profile divergence metrics more in detail @@ -613,11 +661,11 @@ function runNcuDiv() { ###echo "runNcuDiv $exe1 $args" if [ "${verbose}" == "1" ]; then set -x; fi ###$(which ncu) --query-metrics $exe1 $args - ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' - #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' - out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " ") + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", 
"", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' + out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " ") ###echo $out echo $out | awk -v key1="smsp__sass_branch_targets.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' echo $out | awk -v key1="smsp__sass_branch_targets_threads_uniform.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' @@ -637,7 +685,7 @@ function runNcuReq() { for args in "-p 1 1 1" "-p 1 4 1" "-p 1 8 1" "-p 1 32 1" "$ncuArgs"; do ###echo "runNcuReq $exe1 $args" # NB This will print nothing if $args are invalid (eg "-p 1 4 1" when neppR=8) - $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' + $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' done set +x } @@ -659,10 +707,19 @@ else fi echo -e "On $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:" +# Configure scaling tests +if [ "${scaling}" == "0" ]; then # no scaling tests (throughput tests only) + exesSc= +elif [ "${scaling}" == "1" ]; then # scaling tests only (skip throughput tests) + exesSc=$exes + exes= +fi + # These two settings are needed by BMK containers: do not change them BMKEXEARGS="" # if BMKEXEARGS is set, exeArgs is set equal to BMKEXEARGS, while exeArgs2 is set to "" BMKMULTIPLIER=1 # the pre-defined numbers of iterations (including 
those in BMKEXEARGS) are multiplied by BMKMULTIPLIER +# (1) TRADITIONAL THROUGHPUT TESTS ###lastExe= lastExeDir= ###echo "exes=$exes" @@ -699,7 +756,7 @@ for exe in $exes; do exeArgs="-p 1 256 2" ncuArgs="-p 1 256 1" # For smeftggtttt, use the same settings as for ggttggg (may be far too short!) - exeArgs2="-p 64 256 1" + ###exeArgs2="-p 64 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/susy_gg_tt*}" != "${exe}" ]; then # For susyggtt, use the same settings as for SM ggtt exeArgs="-p 2048 256 2" @@ -713,27 +770,27 @@ for exe in $exes; do exeArgs="-p 64 256 10" ncuArgs="-p 64 256 1" # For gqttq, use the same settings as for ggttg - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: remove this for consistency as all others were removed (and not needed as the plateau is reached earlier) elif [ "${exe%%/gg_ttggg*}" != "${exe}" ]; then # For ggttggg: this is far too little for GPU (4.8E2), but it keeps the CPU to a manageble level (1sec with 512y) ###exeArgs="-p 1 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 1 256 2" ncuArgs="-p 1 256 1" # For ggttggg: on GPU test also "64 256" to reach the plateau (only ~5% lower than "2048 256": 1.18E4 vs 1.25E4 on cuda116/gcc102) - exeArgs2="-p 64 256 1" + ###exeArgs2="-p 64 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/gg_ttgg*}" != "${exe}" ]; then # For ggttgg (OLD): this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" exeArgs="-p 64 256 1" ncuArgs="-p 64 256 1" # For ggttgg (NEW): on GPU test both "64 256" and "2048 256" for ggttgg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams elif [ "${exe%%/gg_ttg*}" != "${exe}" ]; then # For ggttg, as on ggttgg: this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" ###exeArgs="-p 64 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 64 256 10" ncuArgs="-p 64 256 1" # For ggttg, as on ggttgg: on GPU test both "64 256" and "2048 256" for ggttg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/gg_tt*}" != "${exe}" ]; then ###exeArgs="-p 2048 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 2048 256 2" @@ -760,9 +817,16 @@ for exe in $exes; do unset OMP_NUM_THREADS fi elif [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then + echo "........................................................................." runNcu $exe "$ncuArgs" - if [ "${div}" == "1" ]; then runNcuDiv $exe; fi - if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi + if [ "${div}" == "1" ]; then + echo "........................................................................." + runNcuDiv $exe + fi + if [ "${req}" == "1" ]; then + echo "........................................................................." 
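The runNcu rewrite above follows the diagram kernel splitting: the former monolithic sigmaKin kernel no longer exists, so ncu is now invoked once per split kernel (diagram1, diagram2, color_sum_kernel). A condensed sketch of that loop, with the awk post-processing of the metrics omitted:

  exe=./build.cuda_m_inl0_hrd1/check_cuda.exe   # example executable from the logs above
  args="-p 2048 256 1"
  kernels="diagram1 diagram2 color_sum_kernel"  # with kernel splitting
  for kernel in $kernels; do
    ncu --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct \
        --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe $args
  done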
+ runNcuReq $exe "$ncuArgs" + fi if [ "${exeArgs2}" != "" ]; then echo "........................................................................."; runExe $exe "$exeArgs2"; fi fi if [ "${gtest}" == "1" ]; then @@ -777,6 +841,46 @@ for exe in $exes; do cmpExe $exe fi done +###echo "=========================================================================" + +# (2) SCALING TESTS +lastExeDir= +for exe in $exesSc; do + if [ "$(basename $(dirname $exe))" != "$lastExeDir" ]; then + echo "=========================================================================" + lastExeDir=$(basename $(dirname $exe)) + else + echo "-------------------------------------------------------------------------" + fi + echo "scalingTest $exe" + if [ ! -f $exe ]; then echo "Not found: $exe"; continue; fi + if [ "${unamep}" != "x86_64" ]; then + if [ "${exe/build.avx2}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + elif [ "${unames}" == "Darwin" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + elif [ "$(grep -m1 -c avx512vl /proc/cpuinfo)" != "1" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + fi + exeDir=$(dirname $exe) + cd $exeDir/.. # workaround for reading '../../Cards/param_card.dat' without setting MG5AMC_CARD_PATH + unset OMP_NUM_THREADS + # Scaling test with 256 threads per block + if [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 256" + for b in 1 2 4 8 16 32 64 128 256 512 1024; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + echo "### GPU: scaling test 32" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + else + echo "### CPU: scaling test 256" + for b in 1 2 4; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ); done + echo "### CPU: scaling test 32" + for b in 1 2 4; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ); done + fi +done echo "=========================================================================" # Workaround for reading of data files
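Aside: the key/value scan used by runNcuDiv above is a reusable pattern. ncu prints one metric per line; the script flattens the table with tr '\n' ' ' and then walks the fields in awk, taking the field after the metric name and skipping values that ncu marks as unavailable with "(!)". A minimal standalone sketch of that pattern (the parse_metric helper name and the sample flattened string are illustrative, not part of the script):

#!/bin/bash
# Sketch of the runNcuDiv key/value scan: look up one metric in a table that
# has already been flattened into a single whitespace-separated line.
parse_metric() {
  local key=$1; shift
  echo "$@" | awk -v key="$key" '
    { val="N/A"
      # take the field after the key, unless ncu flagged the metric with "(!)"
      for (i=1; i<=NF; i++) if ($i==key && $(i+1)!="(!)") val=$(i+1)
      printf "%-51s %s\n", key, val }'
}
out="smsp__warps_launched.sum 32 smsp__sass_branch_targets.sum 123456" # hypothetical flattened ncu output
parse_metric smsp__warps_launched.sum $out
parse_metric smsp__sass_branch_targets.sum $out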

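The scaling tests added in the hunk above reduce each run to one "throughput, #blocks, #threads" record. Stripped of the per-platform guards and the GPU abort workarounds, the shape of the sweep is the loop below (a sketch: it assumes an executable taking the check_cuda-style "-p <blocks> <threads> <iterations>" arguments and printing an "EvtsPerSec[MECalcOnly] ... = ( <value> ) sec^-1" line, from which awk field $5 extracts the value, as in the script itself):

#!/bin/bash
# Sketch of the GPU scaling sweep: double the number of thread blocks at each
# step and tabulate the throughput reported by the executable itself.
exe=$1
tpb=${2:-256} # threads per block (the tests above sweep both 256 and 32)
for (( b=1; b<=1024; b*=2 )); do
  tput=$($exe -p $b $tpb 1 | grep 'EvtsPerSec\[MECalcOnly\]' | awk '{print $5}')
  printf '%-12s %5d blocks x %4d threads/block\n' "${tput:-n/a}" "$b" "$tpb"
done

Plotting throughput against the number of blocks makes the plateau mentioned in the Sep 2025 comments visible: past a certain grid size, doubling the block count no longer increases EvtsPerSec[MECalcOnly].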
       Postscript Diagrams for $proc\<\/A\> \ \n";
   for($j=1;$j<$pages;$j++){
-      print PAGE "\\"Page \ \n";
+      print PAGE "\\"Page \ \n";
   }#end of for
 #
-# In case I didn't include all of the diagrams as jpeg, warn user
+# In case I didn't include all of the diagrams as PNG, warn user
 #
-  if (-e "matrix$imatrix$max_jpg.jpg" ) {
-      print PAGE "